def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--lexicon')
    parser.add_argument('--outdir')
    args = parser.parse_args()

    lexicon = read_lexicon(args.lexicon)

    file_out = open(args.outdir + "filepath.out", "a")
    dtm_out = open(args.outdir + "dtm.out", "a")

    # the name of each file is YYYY_MM.txt.tok
    # we want files grouped by date
    for filename in sorted(glob.iglob(args.article_glob), key=os.path.basename):
        articles, article_index = LoadArticles(filename, False)
        for a, i in zip(articles, article_index):
            counter = {}
            words = a.split()
            for w in words:
                count = counter.get(w, 0)
                counter[w] = count + 1
            dtm_out.write(str(len(counter)) + " ")
            for k in counter:
                dtm_out.write(lexicon[k] + ":" + str(counter[k]) + " ")
            dtm_out.write("\n")
            file_out.write(i[0] + " " + str(i[1]) + "\n")
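# read_lexicon is called above but not defined in this section. A minimal
# sketch of what it is assumed to do, given that the DTM writer looks up
# lexicon[word] and emits "id:count" pairs, and that the lexicon-building
# main() later in this section writes one word per line: map each word to
# its line index as a string. This is a hypothetical helper, not the
# project's actual implementation.
def read_lexicon(lexicon_path):
    word_to_id = {}
    with open(lexicon_path) as fp:
        for idx, line in enumerate(fp):
            word = line.strip()
            if word:
                word_to_id[word] = str(idx)
    return word_to_id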
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--outfile')
    args = parser.parse_args()

    fp = open(args.outfile, "w")
    headings = ["documents", "date", "outlet"]
    fp.write(",".join(headings))
    fp.write("\n")

    articles, indices = LoadArticles(args.article_glob)
    for a, i in zip(articles, indices):
        # we're making a csv so we don't want commas in the actual text;
        # newlines mess up the R csv reader
        a = a.replace(",", "")
        a = a.replace("\n", " ")

        # we want the year/month and the newspaper
        # For Russian we need to go one level up; the file name is [path]/newspaper/year/file
        # dirname = os.path.dirname(os.path.dirname(i[0]))
        dirname = os.path.dirname(i[0])
        base_dirname = os.path.basename(dirname)
        filename = os.path.basename(i[0])

        # expects the filename format to be year_month.txt.tok
        # we call everything the 1st of the month because R doesn't seem to like
        # dates with just a year/month
        year_month = filename.split('.')[0]
        splits = year_month.split('_')
        fp.write(a + "," + "01/" + splits[1] + "/" + splits[0] + "," + base_dirname + "\n")
    fp.close()
def do_english(args):
    stemmer = country_english_stemmer()

    # count number of external articles in each file
    if args.article_glob:
        countries = get_countries(args.country_list, stemmer)
        month_year_to_count = defaultdict(int)
        month_year_to_external = defaultdict(int)
        for filename in glob.iglob(args.article_glob):
            if "Correction" in filename:
                continue
            basename = os.path.basename(filename).split('.')[0]
            articles, _ = LoadArticles(filename)
            for a in articles:
                i, _ = contains_country(a.split(), countries)
                if i >= 2:
                    month_year_to_external[basename] += 1
                month_year_to_count[basename] += 1

        fp = open(args.outfile, "w")
        for v in month_year_to_count:
            fp.write("%s %d %d\n" % (v, month_year_to_external[v], month_year_to_count[v]))
        fp.close()

    if args.reformat:
        pretty_format(args.outfile)
def do_russian(args):
    # stemmer = country_russian_stemmer()  # TO UNDO
    stemmer = country_russian_stemmer()

    # count number of external articles in each file
    if args.article_glob:
        fp = open(args.outfile, "w")
        countries = get_countries(args.country_list, stemmer)
        for filename in glob.iglob(args.article_glob):
            external_count = 0
            articles, _ = LoadArticles(filename)
            word_count = 0
            for a in articles:
                s = a.split()
                i, _ = contains_country(s, countries, stemmer)
                # NOTE NEED TO UNDO THIS LATER. RIGHT NOW DOING THIS TO COUNT SENTIMENT
                # if i >= 2:
                #     external_count += 1
                external_count += i
                word_count += len(s)
            # TO UNDO, SHOULD BE LEN(ARTICLES) INSTEAD OF WORD_COUNT
            fp.write("%s %d %d\n" % (filename, external_count, word_count))
        fp.close()

    if args.reformat:
        pretty_format(args.outfile)
def do_sub(article_name):
    articles, article_indices = LoadArticles(article_name)

    # we can always put this in places later
    filename = os.path.basename(article_name)
    new_filepath = os.path.join(output_dir, filename)
    fp = open(new_filepath, "a+")

    for text in articles:
        text = text.replace("\n", " ").replace("\r", " ")
        if lower:
            text = text.lower()
        if stem:
            words = text.split()
            new_words = []
            for w in words:
                new_words.append(morph_stem(w))
            text = " ".join(new_words)
        text = pattern.sub(lambda m: reg_subs[re.escape(m.group(0))], text)

        # write article
        fp.write(NEW_ARTICLE_TOKEN + "\n")
        fp.write(text)
        fp.write("\n\n")
    fp.close()
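# pattern and reg_subs are module-level globals that do_sub only consumes.
# A hypothetical sketch of how they are presumably built, given that do_sub
# looks replacements up by re.escape(match): reg_subs maps escaped source
# strings to their replacements, and pattern is an alternation over the keys.
# The substitution map below is a placeholder, not the project's real one.
import re

subs = {"ae": "a", "oe": "o"}  # placeholder substitution map
reg_subs = {re.escape(k): v for k, v in subs.items()}
pattern = re.compile("|".join(reg_subs.keys()))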
def get_top_words(filepath):
    noun_to_count = defaultdict(int)
    adj_to_count = defaultdict(int)
    all_to_count = defaultdict(int)

    if os.path.isfile("all_to_count.pickle"):
        all_to_count, noun_to_count, adj_to_count = pickle.load(
            open("all_to_count.pickle", "rb"))
        return noun_to_count, adj_to_count, all_to_count

    articles, _ = LoadArticles(filepath)
    for a in articles:
        words = a.split()
        for word in words:
            tag = get_tag(word)
            if not tag:
                all_to_count[word] += 1
                continue
            if "NOUN" in tag:
                noun_to_count[word] += 1
            elif "ADJF" in tag:
                adj_to_count[word] += 1
            all_to_count[word] += 1

    pickle.dump((all_to_count, noun_to_count, adj_to_count),
                open("all_to_count.pickle", "wb"))
    return noun_to_count, adj_to_count, all_to_count
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--output_dir')
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    articles, article_index = LoadArticles(args.article_glob)
    word_counts = Counter()

    # first pass, we stem everything and collect word counts
    for i in range(0, len(articles)):
        words = articles[i].split()
        new_words = [porter_stemmer.stem(w) for w in words]
        # we just want the count of how many articles each word occurs in
        word_counts.update(set(new_words))
        articles[i] = ' '.join(new_words)

    # exclude any words that are too common or too uncommon
    min_thresh = 0.005 * len(articles)  # 0.5%
    max_thresh = 0.995 * len(articles)  # 99.5%
    print(min_thresh, max_thresh)

    exclude = set()
    for c in word_counts:
        if word_counts[c] < min_thresh or word_counts[c] > max_thresh:
            exclude.add(c)
    for e in exclude:
        print(e)

    # second pass, we eliminate any words that we need to
    # and write articles
    for a, i in zip(articles, article_index):
        # cut excluded words
        words = a.split()
        new_words = [w for w in words if w not in exclude]

        # build path to output file
        dirname = os.path.dirname(i[0])
        filename = os.path.basename(i[0])
        base_dirname = os.path.basename(dirname)
        topic_outdir = args.output_dir + "/" + base_dirname
        if not os.path.exists(topic_outdir):
            os.makedirs(topic_outdir)
        full_path = topic_outdir + "/" + filename

        # write article
        fp = open(full_path, 'a+')
        fp.write(NEW_ARTICLE_TOKEN + "\n")
        fp.write(" ".join(new_words) + "\n\n")
        fp.close()
def get_counts(article_glob):
    background_counter = Counter()
    usa_counter = Counter()

    articles, _ = LoadArticles(article_glob, verbose=False)
    for article in articles:
        article_counter = Counter(article.split())
        background_counter.update(article_counter)
        if article_counter["USA"] >= 2:
            usa_counter.update(article_counter)
    return background_counter, usa_counter
def main():
    out_f = open(args.log_file, "a")
    gold_vector = LoadVectors(args.gold_vectors)[0]

    out_f.write(args.per_month_glob)
    out_f.write("\n")

    for filename in sorted(glob.iglob(args.per_month_glob)):
        dirname = os.path.dirname(filename)
        year_month = os.path.basename(filename.replace(".txt.tok.all.lda", ""))
        vectors = LoadVectors(filename)
        articles, _ = LoadArticles(filename.replace(".all.lda", ""))
        similar_articles = GetSimilarArticles(articles, vectors, gold_vector,
                                              args.similarity_threshold)
        out_f.write("{}\t{}\t{}\n".format(year_month, len(similar_articles), len(vectors)))
def get_sentence_sample(filenames, code_to_lex, second_filter=None, sample_size=20):
    frame_to_sentences = defaultdict(list)
    num_articles = 0
    total_articles = 0

    # take a random sample across all dates
    for filename in filenames:
        articles, _ = LoadArticles(filename, verbose=False)
        for article in articles:
            article_counter = Counter(article.split())
            if article_counter["USA"] < 2:
                continue
            total_articles += 1
            if second_filter:
                if sum([article_counter[w] for w in second_filter]) < 2:
                    continue
            num_articles += 1
            for frame in code_to_lex:
                if sum([article_counter[w] for w in code_to_lex[frame]]) >= params.LEX_COUNT:
                    frame_words = [w for w in code_to_lex[frame] if article_counter[w] > 0]
                    frame_to_sentences[frame].append((article, frame_words))

            # sentences = sentence_tokenizer.tokenize(article)
            # for sentence in sentences:
            #     sent_counter = Counter(sentence.lower().split())
            #     for frame in code_to_lex:
            #         # filter relevant articles
            #         if sum([article_counter[w] for w in code_to_lex[frame]]) < params.LEX_COUNT:
            #             continue
            #         if sum([sent_counter[w] for w in code_to_lex[frame]]) > 1:
            #             frame_words = [w for w in code_to_lex[frame] if sent_counter[w] > 0]
            #             frame_to_sentences[frame].append((sentence, frame_words))

    print(num_articles / total_articles)

    for frame in code_to_lex:
        frame_to_sentences[frame] = random.sample(frame_to_sentences[frame], sample_size)
    return frame_to_sentences
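# A hypothetical usage sketch: get_sentence_sample expects code_to_lex to map
# each frame name to its lexicon words, and params.LEX_COUNT to be the minimum
# number of lexicon hits that qualifies an article for a frame. The frame names
# and words below are placeholders, not the project's actual lexicons.
def example_usage():
    import glob
    code_to_lex = {
        "Economic": ["trade", "tariff", "market"],
        "Strife": ["war", "sanctions", "conflict"],
    }
    frame_to_sentences = get_sentence_sample(
        sorted(glob.iglob("corpus/*.txt.tok")), code_to_lex, sample_size=20)
    for frame, samples in frame_to_sentences.items():
        print(frame, len(samples))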
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--file1')
    parser.add_argument('--file2')
    parser.add_argument('--outfile')
    parser.add_argument('--shortestn', type=int)
    parser.add_argument('--filenum', type=int, default=1)
    args = parser.parse_args()

    articles1, _ = LoadArticles(args.file1)
    labels1 = [l.strip() for l in open(args.file1 + ".labels").readlines()]
    set1 = list(zip(articles1, labels1))

    articles2, _ = LoadArticles(args.file2)
    labels2 = [l.strip() for l in open(args.file2 + ".labels").readlines()]
    set2 = list(zip(articles2, labels2))

    # if shortest n, we only want to keep the shortest n articles from each set
    if args.shortestn:
        set1.sort(key=lambda x: len(x[0]))
        set2.sort(key=lambda x: len(x[0]))
        merged_set = set1[:args.shortestn] + set2[:args.shortestn]
    else:
        merged_set = set1 + set2
    random.shuffle(merged_set)

    # split the samples into the specified number of files
    num_per_file = len(merged_set) // args.filenum
    for i in range(1, args.filenum + 1):
        filename = args.outfile + "_part_" + str(i)
        fp = open(filename, "w")
        fp_labels = open(filename + ".labels", "w")
        for j in range((i - 1) * num_per_file, i * num_per_file):
            fp.write(NEW_ARTICLE_TOKEN + "\n")
            fp.write(merged_set[j][0] + "\n\n")
            fp_labels.write(merged_set[j][1] + "\n")
        fp.close()
        fp_labels.close()
def load_data(filename):
    # Load articles from file
    articles, _ = LoadArticles(filename, verbose=False, split=True)

    # Create vocab dictionary for articles
    dct = Dictionary(articles)
    # dct.filter_extremes(no_below=5, no_above=500, keep_n=100000)
    # note: gensim's no_above is a fraction of documents (0-1), so 500
    # effectively disables the upper cutoff; keep_n=50000 does the real pruning
    dct.filter_extremes(no_below=5, no_above=500, keep_n=50000)

    # convert words to indices
    # we make UNK the highest index
    vocab_size = len(dct)
    articles = [dct.doc2idx(a, unknown_word_index=vocab_size) for a in articles]
    return articles, dct
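# A small sketch of the index convention used by load_data (hypothetical
# helper, not part of the original code): every in-vocabulary token becomes
# its gensim dictionary id, and all out-of-vocabulary tokens share the single
# highest index len(dct), so any embedding table built on top of this needs
# len(dct) + 1 rows.
def decode_article(article_ids, dct):
    unk_index = len(dct)
    return [dct[i] if i != unk_index else "<UNK>" for i in article_ids]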
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--outpath')
    # indicate which api key you want to start on (no point
    # in trying keys that we know we've burned)
    parser.add_argument('--api_start_idx', type=int, default=0)
    args = parser.parse_args()

    country_tags = set()
    continent_tags = set()

    annotator = TexterraAnnotator(args.api_start_idx)

    for filename in sorted(glob.iglob(args.article_glob)):
        # if we've already done this file, move on
        outfile_name = os.path.join(args.outpath, os.path.basename(filename) + ".pickle")
        if os.path.isfile(outfile_name):
            print("Already done", outfile_name)
            continue

        articles, _ = LoadArticles(filename)
        tags = annotator.annotate(articles)
        # tags = annotate_texterra_articles(articles)

        # clear out the crazy amount of extra text that this API returns
        for tag in tags:
            if "annotations" not in tag:
                continue
            if "named-entity" not in tag["annotations"]:
                continue
            for y in tag["annotations"]["named-entity"]:
                del y["annotated-text"]

        # cache these guys
        fp = open(outfile_name, "wb")
        pickle.dump(tags, fp)
        fp.close()

        # maybe want to get more tags eventually
        # for d in tags['annotations']['named-entity']:
        #     if d['value']['tag'] == "GPE_COUNTRY":
        #         country_tags.add(d['text'])
        #     elif d['value']['tag'] == "LOCATION_CONTINENT":
        #         continent_tags.add(d['text'])

    print("Done")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    args = parser.parse_args()

    # lexicon is presumably a module-level set in the original script;
    # defined here so the function is self-contained
    lexicon = set()

    articles, article_index = LoadArticles(args.article_glob, False)
    for a in articles:
        words = a.split()
        for w in words:
            lexicon.add(w)

    out_f = open("lexicon.txt", "a")
    for s in lexicon:
        out_f.write(s + "\n")
    out_f.close()
def main(): input_path = "/usr1/home/anjalief/corpora/russian/yearly_mod_subs/iz_lower/2*" frame_to_lex = pickle.load(open("frame_to_lex_final.pickle", "rb")) merged_frames = get_merged_frames(frame_to_lex) articles,indices = LoadArticles(input_path, verbose=False) sample = random.sample(range(0, len(articles)), 1000) # grab the first 10 that fit our requirements globalization = [] mitigation = [] other = [] for z in sample: s = articles[z] words = Counter(s.split()) if not words["USA"] >= 2: continue global_count = sum([words[w] for w in merged_frames["Economic"]]) miti_count = sum([words[w] for w in merged_frames["Strife"]]) if global_count < 2 and miti_count < 2: other.append((s, indices[z])) elif global_count >=2 and miti_count < 2: globalization.append((s, indices[z])) elif global_count < 2 and miti_count >= 2: mitigation.append((s, indices[z])) if len(globalization) >= 10 and len(mitigation) >= 10 and len(other) >= 10: break print (len(globalization), len(mitigation), len(other)) key = ['G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] all_articles = globalization[:10] + mitigation[:10] + other[:10] shuffle = random.sample(range(0, 30), 30) for i in shuffle: print (key[i], all_articles[i][1]) for i in shuffle: print ("NEW ARTICLE") print (all_articles[i][0].replace(".", ".\n")) print ("")
def get_article_top_words(input_file):
    dir_name = os.path.basename(os.path.dirname(os.path.dirname(input_file)))
    base_name = os.path.join("cache", dir_name + ".counter")
    if os.path.isfile(base_name):
        return pickle.load(open(base_name, "rb"))

    c = Counter()
    article_counter = Counter()
    num_articles = 0

    articles, _ = LoadArticles(input_file)
    for a in articles:
        words = tokenize.word_tokenize(a)
        c.update(words)
        article_counter.update(set(words))
        num_articles += 1

    pickle.dump((c, num_articles, article_counter), open(base_name, "wb"))
    return c, num_articles, article_counter
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--country_list')
    parser.add_argument('--outfile')
    args = parser.parse_args()

    countries = get_countries(args.country_list)
    topic_to_articles = {}

    articles, article_index = LoadArticles(args.article_glob)

    incomplete_topics = set()
    for t in all_topics:
        incomplete_topics.add(t)

    for a, i in zip(articles, article_index):
        dirname = os.path.dirname(i[0])
        base_dirname = os.path.basename(dirname)
        for topic in incomplete_topics:
            # consider this article for one arbitrary topic
            if topic in base_dirname:
                words = a.split()
                if contains_2_countries(words, countries):
                    curr_list = topic_to_articles.get(topic, [])
                    curr_list.append((a, i))
                    if len(curr_list) >= MAX_PER_TOPIC:
                        incomplete_topics.remove(topic)
                    topic_to_articles[topic] = curr_list
                break
        if len(incomplete_topics) == 0:
            break

    # output each topic to its own file
    for t in topic_to_articles:
        fp = open(args.outfile + "/" + t + ".txt", "w")
        for a in topic_to_articles[t]:
            fp.write(str(a[1]) + "\n" + a[0] + "\n\n")
        fp.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument("--out_file") parser.add_argument("--article_glob") parser.add_argument("--threshold", type=float) parser.add_argument("--use_edit_distance", action='store_true') args = parser.parse_args() articles, _ = LoadArticles(args.article_glob) count_matches = 0 distances = [] start = timeit.default_timer() # if we didn't specify a threshold, we don't know it yet # pull a random sample of 1000 articles and hope there # are some duplicates in there that can help us tune sample = min(len(articles), SAMPLE_SIZE) if not args.threshold: indices = random.sample(range(0, len(articles)), sample) # self-note, original was range(0, len(reader)) for i1 in range(0, sample - 1): for j1 in range(i1 + 1, sample): i = indices[i1] j = indices[j1] if args.use_edit_distance: distance = calculate_editdistance(articles[i], articles[j]) else: distance = calculate_cosdistance(articles[i], articles[j]) distances.append((i, j, distance)) distances.sort(key=lambda x: x[2]) stop = timeit.default_timer() print("TIME", stop - start) # print out lowest 100 distances for d in distances[0:100]: print(d[2]) print(articles[d[0]]) print(articles[d[1]]) print("************************************************")
def main(): input_path = "/usr1/home/anjalief/corpora/russian/Izvestiia/*/*_*.txt.tok" sum_num_articles = 0 sum_num_months = 0 token_count = 0 type_count = set() for filename in glob.iglob(input_path): sum_num_months += 1 articles, _ = LoadArticles(filename) sum_num_articles += len(articles) for a in articles: words = a.lower().split() token_count += len(words) type_count = type_count.union(set(words)) print("Average num articles", sum_num_articles / float(sum_num_months)) print("num articles", sum_num_articles) print("num tokens", token_count) print("num types", len(type_count))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--outfile', default='ner_out.txt')
    args = parser.parse_args()

    fp = open(args.outfile, "w")
    articles, _ = LoadArticles(args.article_glob)

    # TODO: spaCy has parallelization library
    for a in articles:
        lines = a.split("\n")

        # Write countries my list found and remove from article
        raw_countries = ""
        for i in range(len(lines) - 1, -1, -1):
            if "COUNTRIES" in lines[i]:
                raw_countries = lines[i]
                del lines[i]
                break
        fp.write(raw_countries + "\n")

        a = "\n".join(lines)
        write_spacy(a, fp)
def main():
    gold_vector = LoadVectors(args.gold_vectors)[0]
    dirname_to_count = {}
    topic_to_count = {}
    for t in all_topics:
        topic_to_count[t] = (0, 0)

    for filename in glob.iglob(args.per_month_glob):
        dirname = os.path.dirname(filename)
        base_dirname = os.path.basename(dirname)
        vectors = LoadVectors(filename)
        articles, _ = LoadArticles(filename.replace(".50.lda", ""), False)
        similar_articles = GetSimilarArticles(articles, vectors, gold_vector,
                                              args.similarity_threshold)

        count = dirname_to_count.get(base_dirname, (0, 0))
        dirname_to_count[base_dirname] = (count[0] + len(articles),
                                          count[1] + len(similar_articles))

        # probably a better way to do this, but whatever
        for t in all_topics:
            if t in base_dirname:
                count = topic_to_count[t]
                topic_to_count[t] = (count[0] + len(articles),
                                     count[1] + len(similar_articles))

    out_f = open(args.log_file + str(args.similarity_threshold) + ".sub.txt", "w")
    for k in dirname_to_count:
        v = dirname_to_count[k]
        out_f.write(k + " " + str(v[0]) + " " + str(v[1]) + "\n")
    out_f.close()

    out_f = open(args.log_file + str(args.similarity_threshold) + ".all.txt", "w")
    for k in topic_to_count:
        v = topic_to_count[k]
        out_f.write(k + " " + str(v[0]) + " " + str(v[1]) + "\n")
    out_f.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--country_list')
    parser.add_argument('--outfile')
    args = parser.parse_args()

    countries = get_countries(args.country_list)
    dirname_to_articles = {}

    articles, article_index = LoadArticles(args.article_glob)
    for a, i in zip(articles, article_index):
        dirname = os.path.dirname(i[0])
        base_dirname = os.path.basename(dirname)

        # we don't need any more articles on this topic
        curr_list = dirname_to_articles.get(base_dirname, [])
        if len(curr_list) >= MAX_PER_TOPIC:
            continue

        words = a.split()
        count_this_article = contains_country(words, countries)
        if count_this_article:
            curr_list.append((a, i))
            dirname_to_articles[base_dirname] = curr_list

    # output each topic to its own file
    for t in dirname_to_articles:
        fp = open(args.outfile + "/" + t + ".txt", "w")
        for a in dirname_to_articles[t]:
            fp.write(str(a[1]) + "\n" + a[0] + "\n\n")
        fp.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--article_glob') parser.add_argument('--outfile') args = parser.parse_args() fp = open(args.outfile, "w") headings = ["documents", "date"] fp.write(",".join(headings)) fp.write("\n") articles, indices = LoadArticles(args.article_glob) for a, i in zip(articles, indices): counter = Counter(a.split()) # Only run on US focused news if counter["USA"] < 2: continue # can probably get better topics with some better preprocessing # we're making a csv so we don't want commas in the actual text # newlines mess up the R csv reader a = a.replace(",", "") a = a.replace("\n", " ") # we want the year/month filename = os.path.basename(i[0]) # expects filename format to be year_month.txt.tok # we're going to call everything the 1st of the month cause R doesn't seem to like # dates with just a year/month year_month = filename.split('.')[0] splits = year_month.split('_') fp.write(a + "," + "01/" + splits[1] + "/" + splits[0] + "\n") fp.close()
#!/usr/bin/env python
from article_utils import LoadArticles, LoadVectors
from collections import Counter

all_glob = "/projects/tir1/users/anjalief/nyt_filtered/*/*.tok"
articles, indices = LoadArticles(all_glob, verbose=False)
print("Loading done")

all_counts = Counter()
business_counts = Counter()

for article, index in zip(articles, indices):
    folder = index[0].split("/")
    words = article.split()
    if folder[len(folder) - 2] == "Business":
        business_counts.update(words)
    else:
        all_counts.update(words)

def write_counts(filename, counts):
    fp = open(filename, 'w')
    for c in counts:
        fp.write(c + " " + str(counts[c]) + "\n")
    fp.close()

write_counts("all_counts.txt", all_counts)
write_counts("business_counts.txt", business_counts)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--country_list')
    parser.add_argument('--labeled_data')
    parser.add_argument('--lemmatize_russian', action='store_true')
    args = parser.parse_args()

    stemmer = None
    if args.lemmatize_russian:
        stemmer = country_russian_stemmer()

    countries = get_countries(args.country_list, stemmer)

    if args.article_glob:
        article_to_count = {}
        articles, article_index = LoadArticles(args.article_glob)
        for a, i in zip(articles, article_index):
            words = a.split()
            count_this_article, _ = contains_country(words, countries, stemmer)
            count = 0
            if count_this_article >= 2:
                count = 1
            tup = article_to_count.get(i[0], (0, 0))
            article_to_count[i[0]] = (tup[0] + count, tup[1] + 1)

        fp = open("article_to_count.txt", 'w')
        for t in article_to_count:
            fp.write(t + " " + str(article_to_count[t][0]) + " " +
                     str(article_to_count[t][1]) + "\n")
        fp.close()

    if args.labeled_data:
        tracker = TrackCorrect()
        articles, _ = LoadArticles(args.labeled_data)
        labels = LoadGold(args.labeled_data + ".labels")
        internal_country_count = []
        external_country_count = []
        for a, l in zip(articles, labels):
            i, cl = texterra_count_countries(a)
            tracker.update(l.is_external, i >= 2)

            # count number of countries per article
            if l.is_external:
                external_country_count.append(i)
            else:
                internal_country_count.append(i)

            if not i and l.is_external:
                fp = open("false_negatives.txt", 'a+')
                fp.write(a)
                fp.write("\n******************* ")
                for c in cl:
                    fp.write(c + " ")
                fp.write("\n\n")
                fp.close()
            elif i >= 2 and not l.is_external:
                print(a, "\n")
                print(cl, "\n\n\n")

        precision, recall, accuracy, gold_external, count = tracker.get_stats()
        print(precision, recall, accuracy, gold_external, count)

        print("EXTERNAL")
        for i in external_country_count:
            print(i)
        print("INTERNAL")
        for i in internal_country_count:
            print(i)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--file_name')
    parser.add_argument('--num_samples', type=int)
    parser.add_argument('--sample_size', type=int)
    # if we're filtering, it can take forever. Instead, we downsample to
    # some multiple of how many articles we actually need, then filter those,
    # then sample again from the filtered set. This saves us from
    # running the filtering on tons of articles
    parser.add_argument('--downsample', type=int)
    parser.add_argument('--country_list')
    parser.add_argument('--russian_ner', action='store_true')
    parser.add_argument('--label_file_exists', action='store_true')
    parser.add_argument('--merge_labels', action='store_true')
    args = parser.parse_args()

    articles, article_index = LoadArticles(args.article_glob)

    # see above note on pre-sampling
    if args.downsample:
        # we zip, downsample, and unzip
        articles, article_index = zip(*random.sample(
            list(zip(articles, article_index)),
            args.sample_size * args.num_samples * args.downsample))
        print("DOWNSAMPLE", len(articles), len(article_index))

    # if the articles are already labeled, then we split the labels
    # as we split the articles (i.e. instead of the article number)
    if args.label_file_exists:
        labels = []
        for filename in glob.iglob(args.article_glob):
            l = open(filename + ".labels").readlines()
            labels += l
        article_index = labels

    # if we've given a country list, we only keep articles that have
    # a country name in them
    if args.country_list:
        stemmer = country_russian_stemmer()
        countries = get_countries(args.country_list, stemmer)

        # We need this wrapper because the filter functions take different arguments
        def include_by_country(a):
            return contains_country(a.split(), countries, stemmer)

        articles, article_index = filter_articles(articles, article_index, include_by_country)

    # probably not going to ever filter by both things so we can do a second pass
    if args.russian_ner:
        articles, article_index = filter_articles(articles, article_index,
                                                  texterra_count_countries)

    # choose sample_size random articles
    print("POST FILTER", len(articles), len(article_index))
    ran = random.sample(list(zip(articles, article_index)),
                        args.sample_size * args.num_samples)

    index = 0
    for i in range(0, args.num_samples):
        articles_fp = open(args.file_name + str(i), 'w+')
        labels_fp = open(args.file_name + str(i) + ".labels", 'w+')
        for j in range(0, args.sample_size):
            write_article(articles_fp, labels_fp, ran[index][0], ran[index][1],
                          args.merge_labels)
            index += 1
        articles_fp.close()
        labels_fp.close()
    print("DONE", args.article_glob)
        prior[w] += 1
    return result1, result2, prior

stopwords = set()
if args.stopwords:
    stopwords = LoadStopwords(args.stopwords)
else:
    print("Not using stopwords")

# this means we want to filter, only take articles that have country names
if args.country_list:
    countries = get_countries(args.country_list)
    # special casing this for now, atm we want prior to be combined dict
    articles1, _ = LoadArticles(args.first)
    counts1, prior = LoadFilteredCounts(articles1, countries, defaultdict(int))
    articles2, _ = LoadArticles(args.second)
    counts2, prior = LoadFilteredCounts(articles2, countries, prior)
elif args.single_country:
    # take all articles with country name as corpus 1 and articles without as corpus 2
    articles1, _ = LoadArticles(args.first)
    counts1, counts2, prior = LoadSplitCounts(articles1, args.single_country)
else:
    counts1 = LoadCounts(args.first, 0, stopwords)
    counts2 = LoadCounts(args.second, 0, stopwords)

if args.prior:
    prior = LoadCounts(args.prior, args.min_count, stopwords)
else: