def store_as_csv(testing_article_indexes, testing_article_categories, output_file_name=OUTPUT_CSV_FILE):
    """Store prediction results as a semicolon-delimited CSV with 'id' and 'specialCoverage' columns."""
    start_time = datetime.utcnow()
    log("Storing prediction results as csv in: {}".format(output_file_name))
    with open(output_file_name, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(['id', 'specialCoverage'])
        for index, category in zip(testing_article_indexes, testing_article_categories):
            spamwriter.writerow([index, category])
    log("Done storing prediction csv in {}s.".format(seconds_since(start_time)))
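# Usage sketch (illustrative only): the article ids, labels and file name below are made up
# for the example. The resulting file holds rows like "101;1" under the "id;specialCoverage" header.
#
#   store_as_csv([101, 102, 103], [1, 0, 1], output_file_name="predictions.csv")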
def dump(self, export_file_name):
    """Serialize the word cache (families, word-to-family index, ignored words) to a JSON file."""
    start_time = datetime.utcnow()
    dump = {
        "index_to_family": self.index_to_family,
        "word_to_family_index": self.word_to_family_index,
        "ignored_words": list(self.ignored_words)
    }
    try:
        with open(export_file_name, mode='w') as output_file:
            json.dump(dump, output_file, cls=ComplexObjectSerializer)
        log("Done export of {} families and {} ignored words in {}s.".format(
            len(self.index_to_family.keys()), len(self.ignored_words), seconds_since(start_time)))
    except Exception as e:
        log("Could not export data to JSON file: {}. Reason: {}".format(export_file_name, e))
def load(self, export_file_name):
    """Restore the word cache from a JSON dump produced by dump()."""
    start_time = datetime.utcnow()
    try:
        with open(export_file_name, mode='r') as input_file:
            dump = json.load(input_file)
        self.ignored_words = set(dump.get("ignored_words") or [])
        self.word_to_family_index = dump.get("word_to_family_index") or {}
        index_to_family_dict = dump.get("index_to_family") or {}
        for index, family_dict in index_to_family_dict.iteritems():
            self.index_to_family.update({
                int(index): WordFamily(part_of_speech=family_dict.get(u"part_of_speech"),
                                       synonyms=set(family_dict.get(u"synonyms")))
            })
        log("Done import of {} families and {} ignored words in {}s.".format(
            len(self.index_to_family.keys()), len(self.ignored_words), seconds_since(start_time)))
    except Exception as e:
        log("Could not import data from JSON file: {}. Reason: {}".format(export_file_name, e))
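# Usage sketch (illustrative only): dump() and load() are meant to round-trip the cache through
# JSON. The class name "WordCache" and the file name "cache.json" below are assumptions made for
# the example, not names defined in this snippet.
#
#   cache = WordCache()
#   # ... populate the cache via add_word() ...
#   cache.dump("cache.json")
#   restored = WordCache()
#   restored.load("cache.json")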
def find_and_store_similar_families(word_cache, filename=SIMILARITIES_FILE_NAME):
    """Compare every pair of word families and store the index pairs whose similarity exceeds the threshold."""
    SIMILARITY_THRESHOLD = 0.3
    changes = []
    start_time = datetime.utcnow()
    for outer_index, outer_family in word_cache.index_to_family.iteritems():
        for inner_index, inner_family in word_cache.index_to_family.iteritems():
            # Only compare each unordered pair once.
            if outer_index < inner_index:
                ratio = similarity(outer_family, inner_family)
                if ratio > SIMILARITY_THRESHOLD:
                    log("similarity: {} between (#{}) {} and (#{}) {}".format(
                        round(ratio, 2), inner_index, inner_family.synonyms,
                        outer_index, outer_family.synonyms))
                    changes.append((outer_index, inner_index))
    with open(filename, mode='w') as output_file:
        json.dump(changes, output_file)
    log("Found {} similarities in {}s!".format(len(changes), seconds_since(start_time)))
def load_articles(file_name):
    """
    :type file_name: str
    :rtype: list[model.article.Article | model.article.TrainingArticle]
    """
    start_time = datetime.utcnow()
    articles = []
    with open(file_name) as training_file:
        training_file_dicts = json.load(training_file, encoding=SOURCE_ENCODING)
        for training_file_dict in training_file_dicts:
            articles.append(article_to_model(training_file_dict))
    articles = filter_nones(articles)
    log("Loaded {} dicts, translated to {} models ({}%) within {} seconds".format(
        len(training_file_dicts), len(articles),
        percentage(len(articles), len(training_file_dicts)), seconds_since(start_time)))
    return articles
def load_features(features_file_name=FEATURES_DUMP_FILE_NAME):
    """
    :type features_file_name: str
    :rtype: list[model.ArticleFeatures.ArticleFeatures]
    """
    start_time = datetime.utcnow()
    try:
        with open(features_file_name, mode='r') as input_file:
            features_dump = json.load(input_file)
        output_features = []
        for feature in features_dump:
            article_id = int(feature.get("article_id"))
            word_family_index_to_occurences = {}
            for family_index, occurences in feature.get("word_family_index_to_occurences").iteritems():
                if family_index is not None and occurences is not None:
                    word_family_index_to_occurences.update({int(family_index): int(occurences)})
            output_features.append(ArticleFeatures(article_id=article_id,
                                                   word_family_index_to_occurences=word_family_index_to_occurences))
        log("Done import of {} articles' features in {}s.".format(len(output_features), seconds_since(start_time)))
        return output_features
    except Exception as e:
        log("Could not import features from JSON file: {}. Reason: {}".format(features_file_name, e))
        return []
def store_features(article_features, features_file_name=FEATURES_DUMP_FILE_NAME):
    """
    :type article_features: list[model.ArticleFeatures.ArticleFeatures]
    :type features_file_name: str
    """
    start_time = datetime.utcnow()
    try:
        with open(features_file_name, mode='w') as output_file:
            json.dump(article_features, output_file, cls=ComplexObjectSerializer)
        log("Done export of {} articles' features in {}s.".format(len(article_features), seconds_since(start_time)))
    except Exception as e:
        log("Could not export article features data to JSON file: {}. Reason: {}".format(features_file_name, e))
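# Usage sketch (illustrative only): store_features() and load_features() round-trip ArticleFeatures
# objects through JSON. The file name "testing_features.json" below is an assumption for the example.
#
#   store_features(article_features, features_file_name="testing_features.json")
#   restored = load_features(features_file_name="testing_features.json")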
for index, article in enumerate(articles):
    start_time = datetime.utcnow()
    log("Started parsing article #{}/{}: {}...".format(index, len(articles), article))
    if article.id not in parsed_features:
        article_feature = ArticleFeatures(article.id)
        # Strip HTML tags, drop short words and split the headline + body into words.
        linked_text_to_translate = strip_tags(article.headline + " " + article.text)
        linked_text_to_translate = filter_words_shorter_than(linked_text_to_translate)
        all_words = process_article_to_words(" ".join(linked_text_to_translate))
        log("Article #{}: {}\nall words ({}): {}...".format(
            index, article, len(all_words), crop_list_to_max(all_words)))
        # Map each word to its word family and count occurrences per family.
        for word in all_words:
            word_family_index = word_cache.add_word(word)
            if word_family_index is not None:
                article_feature.add_occurence(word_family_index)
        log("Ended parsing #{} article features: {}\narticle analyzed in: {}\n\n".format(
            index, article_feature, seconds_since(start_time)))
        # Persist the cache and accumulated features after every article so progress survives interruptions.
        word_cache.dump(CACHE_DUMP_FILE)
        parsed_features.append(article_feature.article_id)
        article_features.append(article_feature)
        store_features(article_features=article_features,
                       features_file_name=TESTING_SET_FEATURES_DUMP_FILE_NAME)
    else:
        log("Skipping article #{}: {} - already parsed!".format(index, article))
log("Done!")