Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--lexicon')
    parser.add_argument('--outdir')
    args = parser.parse_args()

    lexicon = read_lexicon(args.lexicon)
    file_out = open(os.path.join(args.outdir, "filepath.out"), "a")
    dtm_out = open(os.path.join(args.outdir, "dtm.out"), "a")

    # the name of each file is YYYY_MM.txt.tok
    # we want files grouped by date
    for filename in sorted(glob.iglob(args.article_glob),
                           key=os.path.basename):
        articles, article_index = LoadArticles(filename, False)
        for a, i in zip(articles, article_index):
            counter = {}
            words = a.split()
            for w in words:
                count = counter.get(w, 0)
                counter[w] = count + 1
            dtm_out.write(str(len(counter)) + " ")
            for k in counter:
                dtm_out.write(lexicon[k] + ":" + str(counter[k]) + " ")
            dtm_out.write("\n")
            file_out.write(i[0] + " " + str(i[1]) + "\n")
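
# A minimal sketch (not part of the original) of the same sparse row format
# written above, using collections.Counter; assumes the same `lexicon`
# word -> id mapping returned by read_lexicon.
from collections import Counter

def dtm_row(article, lexicon):
    # one row per article: "<num unique terms> <term_id>:<count> ..."
    counter = Counter(article.split())
    parts = [str(len(counter))] + ["%s:%d" % (lexicon[w], c) for w, c in counter.items()]
    return " ".join(parts)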
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--outfile')
    args = parser.parse_args()

    fp = open(args.outfile, "w")
    headings = ["documents", "date", "outlet"]
    fp.write(",".join(headings))
    fp.write("\n")
    articles, indices = LoadArticles(args.article_glob)
    for a, i in zip(articles, indices):
        # we're making a csv so we don't want commas in the actual text
        # newlines mess up the R csv reader
        a = a.replace(",", "")
        a = a.replace("\n", " ")

        # we want the year/month and the newspaper
        # For Russian we need to go one level up; the file name is [path]/newspaper/year/file
        # dirname = os.path.dirname(os.path.dirname(i[0]))
        dirname = os.path.dirname(i[0])
        base_dirname = os.path.basename(dirname)
        filename = os.path.basename(i[0])
        # expects filename format to be year_month.txt.tok
        # we're going to date everything the 1st of the month because R doesn't
        # seem to like dates with just a year/month
        year_month = filename.split('.')[0]
        splits = year_month.split('_')
        fp.write(a + "," + "01/" + splits[1] + "/" + splits[0] + "," +
                 base_dirname + "\n")
    fp.close()
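
# A hedged helper sketch (not in the original) for the date handling above:
# turn a "YYYY_MM.txt.tok" basename into the "01/MM/YYYY" string written to the CSV.
import os

def first_of_month_date(path):
    year_month = os.path.basename(path).split('.')[0]
    year, month = year_month.split('_')
    return "01/%s/%s" % (month, year)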
def do_english(args):
    stemmer = country_english_stemmer()

    # count number of external articles in each file
    if args.article_glob:
        countries = get_countries(args.country_list, stemmer)
        month_year_to_count = defaultdict(int)
        month_year_to_external = defaultdict(int)
        for filename in glob.iglob(args.article_glob):
            if "Correction" in filename:
                continue
            basename = os.path.basename(filename).split('.')[0]
            articles, _ = LoadArticles(filename)
            for a in articles:
                i, _ = contains_country(a.split(), countries)
                if i >= 2:
                    month_year_to_external[basename] += 1
                month_year_to_count[basename] += 1
        fp = open(args.outfile, "w")
        for v in month_year_to_count:
            fp.write("%s %d %d\n" %
                     (v, month_year_to_external[v], month_year_to_count[v]))
        fp.close()
    if args.reformat:
        pretty_format(args.outfile)
def do_russian(args):
    stemmer = country_russian_stemmer()

    # count number of external articles in each file
    if args.article_glob:
        fp = open(args.outfile, "w")
        countries = get_countries(args.country_list, stemmer)
        for filename in glob.iglob(args.article_glob):
            external_count = 0
            articles, _ = LoadArticles(filename)
            word_count = 0
            for a in articles:
                s = a.split()
                i, _ = contains_country(s, countries, stemmer)
                # NOTE: temporarily counting every mention so we can count
                # sentiment; the original logic counted external articles:
                #     if i >= 2:
                #         external_count += 1
                external_count += i
                word_count += len(s)
            # TODO: revert to writing len(articles) here instead of word_count
            fp.write("%s %d %d\n" % (filename, external_count, word_count))
        fp.close()
    if args.reformat:
        pretty_format(args.outfile)
Example #5
def do_sub(article_name):
    articles, article_indices = LoadArticles(article_name)

    # keep the output flat for now; we can always sort files into subdirectories later
    filename = os.path.basename(article_name)
    new_filepath = os.path.join(output_dir, filename)

    fp = open(new_filepath, "a+")

    for text in articles:
        text = text.replace("\n", " ").replace("\r", " ")
        if lower:
            text = text.lower()
        if stem:
            words = text.split()
            new_words = []
            for w in words:
                new_words.append(morph_stem(w))
            text = " ".join(new_words)
        text = pattern.sub(lambda m: reg_subs[re.escape(m.group(0))], text)

        # write article
        fp.write(NEW_ARTICLE_TOKEN + "\n")
        fp.write(text)
        fp.write("\n\n")

    fp.close()
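
# `pattern` and `reg_subs` are globals in the original file; a hedged sketch of
# the standard multi-replacement setup the substitution line above appears to
# rely on (an assumption, not the original definitions).
import re

def build_substitutions(subs):
    # subs maps a literal string to its replacement, e.g. {"U.S.": "USA"} (illustrative only)
    reg_subs = {re.escape(k): v for k, v in subs.items()}
    # longest targets first so overlapping literals match greedily
    pattern = re.compile("|".join(sorted(reg_subs.keys(), key=len, reverse=True)))
    return pattern, reg_subs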
Example #6
def get_top_words(filepath):
    noun_to_count = defaultdict(int)
    adj_to_count = defaultdict(int)
    all_to_count = defaultdict(int)

    if os.path.isfile("all_to_count.pickle"):
        all_to_count, noun_to_count, adj_to_count = pickle.load(
            open("all_to_count.pickle", "rb"))
        return noun_to_count, adj_to_count, all_to_count

    articles, _ = LoadArticles(filepath)
    for a in articles:
        words = a.split()
        for word in words:
            tag = get_tag(word)
            if not tag:
                all_to_count[word] += 1
                continue
            if "NOUN" in tag:
                noun_to_count[word] += 1
            elif "ADJF" in tag:
                adj_to_count[word] += 1
            all_to_count[word] += 1
    pickle.dump((all_to_count, noun_to_count, adj_to_count),
                open("all_to_count.pickle", "wb"))
    return noun_to_count, adj_to_count, all_to_count
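
# A brief usage sketch (the path below is illustrative): the returned counters
# are plain dicts of word -> count, so they can be ranked directly.
nouns, adjs, all_words = get_top_words("corpus.txt.tok")
for word, count in sorted(nouns.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print(word, count)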
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--output_dir')
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    articles, article_index = LoadArticles(args.article_glob)
    word_counts = Counter()

    # first pass, we stem everything and collect word counts
    for i in range(0, len(articles)):
        words = articles[i].split()
        new_words = [porter_stemmer.stem(w) for w in words]
        # we only want the count of how many articles each word occurs in
        word_counts.update(set(new_words))
        articles[i] = ' '.join(new_words)

    # exclude any words that are too common or too uncommon
    min_thresh = 0.005 * len(articles)  # .5%
    max_thresh = .995 * len(articles)  # 99.5%
    print(min_thresh, max_thresh)
    exclude = set()
    for c in word_counts:
        if word_counts[c] < min_thresh or word_counts[c] > max_thresh:
            exclude.add(c)

    for e in exclude:
        print(e)
    # second pass, we eliminate any words that we need to
    # and write articles
    for a, i in zip(articles, article_index):

        # cut exclude words
        words = a.split()
        new_words = [w for w in words if w not in exclude]

        # build path to output file
        dirname = os.path.dirname(i[0])
        filename = os.path.basename(i[0])
        base_dirname = os.path.basename(dirname)

        topic_outdir = args.output_dir + "/" + base_dirname
        if not os.path.exists(topic_outdir):
            os.makedirs(topic_outdir)

        full_path = topic_outdir + "/" + filename

        # write article
        fp = open(full_path, 'a+')
        fp.write(NEW_ARTICLE_TOKEN + "\n")
        fp.write(" ".join(new_words) + "\n\n")
        fp.close()
def get_counts(article_glob):
    background_counter = Counter()
    usa_counter = Counter()
    articles, _ = LoadArticles(article_glob, verbose=False)
    for article in articles:
        article_counter = Counter(article.split())
        background_counter.update(article_counter)
        if article_counter["USA"] >= 2:
            usa_counter.update(article_counter)

    return background_counter, usa_counter
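
# A hedged usage sketch (the glob is illustrative): compare words in USA-focused
# articles against the background corpus by relative frequency.
background, usa = get_counts("articles/*.txt.tok")
background_total = float(sum(background.values()))
usa_total = float(sum(usa.values()))
for word, count in usa.most_common(50):
    rel_usa = count / usa_total
    rel_background = background[word] / background_total
    ratio = rel_usa / rel_background if rel_background else float("inf")
    print(word, count, ratio)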
Example #9
def main():
    out_f = open(args.log_file, "a")
    gold_vector = LoadVectors(args.gold_vectors)[0]
    out_f.write(args.per_month_glob)
    out_f.write("\n")
    for filename in sorted(glob.iglob(args.per_month_glob)):
        dirname = os.path.dirname(filename)
        year_month = os.path.basename(filename.replace(".txt.tok.all.lda", ""))
        vectors = LoadVectors(filename)
        articles, _ = LoadArticles(filename.replace(".all.lda", ""))
        similar_articles = GetSimilarArticles(articles, vectors, gold_vector,
                                              args.similarity_threshold)
        out_f.write("{}\t{}\t{}\n".format(year_month, len(similar_articles),
                                          len(vectors)))
Example #10
def get_sentence_sample(filenames,
                        code_to_lex,
                        second_filter=None,
                        sample_size=20):
    frame_to_sentences = defaultdict(list)
    num_articles = 0
    total_articles = 0

    # take a random sample across all dates
    for filename in filenames:
        articles, _ = LoadArticles(filename, verbose=False)
        for article in articles:

            article_counter = Counter(article.split())
            if article_counter["USA"] < 2:
                continue

            total_articles += 1
            if second_filter:
                if sum([article_counter[w] for w in second_filter]) < 2:
                    continue

            num_articles += 1

            for frame in code_to_lex:
                if sum([article_counter[w]
                        for w in code_to_lex[frame]]) >= params.LEX_COUNT:
                    frame_words = [
                        w for w in code_to_lex[frame] if article_counter[w] > 0
                    ]
                    frame_to_sentences[frame].append((article, frame_words))

            # sentences = sentence_tokenizer.tokenize(article)

            # for sentence in sentences:
            #     sent_counter = Counter(sentence.lower().split())
            #     for frame in code_to_lex:

            #         # filter relevant articles
            #         if sum([article_counter[w] for w in code_to_lex[frame]]) < params.LEX_COUNT:
            #             continue

            #         if sum([sent_counter[w] for w in code_to_lex[frame]]) > 1:
            #             frame_words = [w for w in code_to_lex[frame] if sent_counter[w] > 0]
            #             frame_to_sentences[frame].append((sentence, frame_words))

    print(num_articles / total_articles)
    for frame in code_to_lex:
        frame_to_sentences[frame] = random.sample(frame_to_sentences[frame],
                                                  sample_size)
    return frame_to_sentences
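
# A hedged usage sketch; the glob and the frame lexicon below are illustrative
# placeholders, and params.LEX_COUNT must be defined as in the original module.
import glob
filenames = sorted(glob.iglob("articles/*.txt.tok"))
code_to_lex = {"Economic": ["trade", "tariff", "export", "market"]}
frame_to_sentences = get_sentence_sample(filenames, code_to_lex, sample_size=20)
for frame, samples in frame_to_sentences.items():
    print(frame, len(samples))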
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--file1')
    parser.add_argument('--file2')
    parser.add_argument('--outfile')
    parser.add_argument('--shortestn', type=int)
    parser.add_argument('--filenum', type=int, default=1)
    args = parser.parse_args()


    articles1, _ = LoadArticles(args.file1)
    labels1 = [l.strip() for l in open(args.file1 + ".labels").readlines()]
    set1 = list(zip(articles1, labels1))

    articles2, _ = LoadArticles(args.file2)
    labels2 = [l.strip() for l in open(args.file2 + ".labels").readlines()]
    set2 = list(zip(articles2, labels2))

    # if shortest n, we only want to keep the shortest n articles from each set
    if args.shortestn:
        set1.sort(key=lambda x: len(x[0]))
        set2.sort(key=lambda x: len(x[0]))
        merged_set = set1[:args.shortestn] + set2[:args.shortestn]
    else:
        merged_set = set1 + set2
    random.shuffle(merged_set)

    # split the samples into specified number of files
    num_per_file = len(merged_set) // args.filenum
    for i in range(1, args.filenum + 1):
        filename = args.outfile + "_part_" + str(i)
        fp = open(filename, "w")
        fp_labels = open(filename + ".labels", "w")
        for j in range((i - 1) * num_per_file, i * num_per_file):
            fp.write(NEW_ARTICLE_TOKEN + "\n")
            fp.write(merged_set[j][0] + "\n\n")
            fp_labels.write(merged_set[j][1] + "\n")
        fp.close()
Example #12
def load_data(filename):
    # Load articles from file
    articles, _ = LoadArticles(filename, verbose=False, split=True)

    # Create vocab dictionary for articles
    dct = Dictionary(articles)
    # dct.filter_extremes(no_below=5, no_above=500, keep_n=100000)
    # note: gensim's no_above is a fraction of documents, so 500 effectively
    # disables that filter; only no_below and keep_n are doing work here
    dct.filter_extremes(no_below=5, no_above=500, keep_n=50000)

    # convert words to indices
    # we make UNK the highest index
    vocab_size = len(dct)
    articles = [dct.doc2idx(a, unknown_word_index=vocab_size) for a in articles]

    return articles, dct
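
# A brief usage sketch (the path is illustrative): round-trip a few indices back
# to tokens. In gensim, Dictionary supports id -> token lookup by indexing, and
# the UNK id chosen above is len(dct), which has no dictionary entry.
articles, dct = load_data("corpus.txt.tok")
unk_id = len(dct)
print([dct[idx] if idx != unk_id else "<UNK>" for idx in articles[0][:20]])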
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--outpath')
    # indicate what api key you want to start on (no point
    # in trying keys that we know we've burned)
    parser.add_argument('--api_start_idx', type=int, default=0)
    args = parser.parse_args()

    country_tags = set()
    continent_tags = set()
    annotator = TexterraAnnotator(args.api_start_idx)
    for filename in sorted(glob.iglob(args.article_glob)):

        # if we've already done this file, move on
        outfile_name = os.path.join(args.outpath,
                                    os.path.basename(filename) + ".pickle")
        if os.path.isfile(outfile_name):
            print("Already done", outfile_name)
            continue

        articles, _ = LoadArticles(filename)
        tags = annotator.annotate(articles)
        #        tags = annotate_texterra_articles(articles)

        # clear out the crazy amount of extra text that this API returns
        for tag in tags:
            if not "annotations" in tag:
                continue
            if not "named-entity" in tag["annotations"]:
                continue
            for y in tag["annotations"]["named-entity"]:
                del y["annotated-text"]

        # cache these guys
        fp = open(outfile_name, "wb")
        pickle.dump(tags, fp)
        fp.close()

        # maybe want to get more tags eventually
        # for d in tags['annotations']['named-entity']:
        #     if d['value']['tag'] ==  "GPE_COUNTRY":
        #         country_tags.add(d['text'])

        #     elif d['value']['tag'] == "LOCATION_CONTINENT":
        #         continent_tags.add(d['text'])

    print("Done")
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    args = parser.parse_args()

    lexicon = set()
    articles, article_index = LoadArticles(args.article_glob, False)
    for a in articles:
        words = a.split()
        for w in words:
            lexicon.add(w)

    out_f = open("lexicon.txt", "a")
    for s in lexicon:
        out_f.write(s + "\n")

    out_f.close()
def main():
    input_path = "/usr1/home/anjalief/corpora/russian/yearly_mod_subs/iz_lower/2*"
    frame_to_lex = pickle.load(open("frame_to_lex_final.pickle", "rb"))

    merged_frames = get_merged_frames(frame_to_lex)

    articles, indices = LoadArticles(input_path, verbose=False)

    sample = random.sample(range(0, len(articles)), 1000)

    # grab the first 10 that fit our requirements
    globalization = []
    mitigation = []
    other = []

    for z in sample:
        s = articles[z]

        words = Counter(s.split())
        if words["USA"] < 2:
            continue

        global_count = sum([words[w] for w in merged_frames["Economic"]])
        miti_count = sum([words[w] for w in merged_frames["Strife"]])
        if global_count < 2 and miti_count < 2:
            other.append((s, indices[z]))
        elif global_count >= 2 and miti_count < 2:
            globalization.append((s, indices[z]))
        elif global_count < 2 and miti_count >= 2:
            mitigation.append((s, indices[z]))

        if len(globalization) >= 10 and len(mitigation) >= 10 and len(other) >= 10:
            break

    print(len(globalization), len(mitigation), len(other))
    key = ['G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G',
           'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
           'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
    all_articles = globalization[:10] + mitigation[:10] + other[:10]
    shuffle = random.sample(range(0, 30), 30)
    for i in shuffle:
        print(key[i], all_articles[i][1])
    for i in shuffle:
        print ("NEW ARTICLE")
        print (all_articles[i][0].replace(".", ".\n"))
        print ("")
Example #16
def get_article_top_words(input_file):
    dir_name = os.path.basename(os.path.dirname(os.path.dirname(input_file)))
    base_name = os.path.join("cache", dir_name + ".counter")

    if os.path.isfile(base_name):
        return pickle.load(open(base_name, "rb"))

    c = Counter()
    article_counter = Counter()
    num_articles = 0
    articles, _ = LoadArticles(input_file)
    for a in articles:
        words = tokenize.word_tokenize(a)
        c.update(words)
        article_counter.update(set(words))
        num_articles += 1
    pickle.dump((c, num_articles, article_counter), open(base_name, "wb"))
    return c, num_articles, article_counter
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--country_list')
    parser.add_argument('--outfile')
    args = parser.parse_args()

    countries = get_countries(args.country_list)

    topic_to_articles = {}

    articles, article_index = LoadArticles(args.article_glob)

    incomplete_topics = set()
    for t in all_topics:
        incomplete_topics.add(t)

    for a, i in zip(articles, article_index):

        dirname = os.path.dirname(i[0])
        base_dirname = os.path.basename(dirname)

        for topic in incomplete_topics:
            # consider this article for one arbitrary topic
            if topic in base_dirname:
                words = a.split()
                if contains_2_countries(words, countries):
                    curr_list = topic_to_articles.get(topic, [])
                    curr_list.append((a, i))

                    if len(curr_list) >= MAX_PER_TOPIC:
                        incomplete_topics.remove(topic)

                    topic_to_articles[topic] = curr_list
                break
        if len(incomplete_topics) == 0:
            break

    # output each topic to its own file
    for t in topic_to_articles:
        fp = open(args.outfile + "/" + t + ".txt", "w")
        for a in topic_to_articles[t]:
            fp.write(str(a[1]) + "\n" + a[0] + "\n\n")
        fp.close()
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--out_file")
    parser.add_argument("--article_glob")
    parser.add_argument("--threshold", type=float)
    parser.add_argument("--use_edit_distance", action='store_true')
    args = parser.parse_args()

    articles, _ = LoadArticles(args.article_glob)
    count_matches = 0

    distances = []
    start = timeit.default_timer()

    # if we didn't specify a threshold, we don't know it yet
    # pull a random sample of 1000 articles and hope there
    # are some duplicates in there that can help us tune
    sample = min(len(articles), SAMPLE_SIZE)
    if not args.threshold:
        indices = random.sample(range(0, len(articles)), sample)

        # self-note, original was range(0, len(reader))
        for i1 in range(0, sample - 1):
            for j1 in range(i1 + 1, sample):
                i = indices[i1]
                j = indices[j1]
                if args.use_edit_distance:
                    distance = calculate_editdistance(articles[i], articles[j])
                else:
                    distance = calculate_cosdistance(articles[i], articles[j])
                distances.append((i, j, distance))

        distances.sort(key=lambda x: x[2])
        stop = timeit.default_timer()
        print("TIME", stop - start)

        # print out lowest 100 distances
        for d in distances[0:100]:
            print(d[2])
            print(articles[d[0]])
            print(articles[d[1]])
            print("************************************************")
Example #19
def main():
    input_path = "/usr1/home/anjalief/corpora/russian/Izvestiia/*/*_*.txt.tok"
    sum_num_articles = 0
    sum_num_months = 0
    token_count = 0
    type_count = set()
    for filename in glob.iglob(input_path):
        sum_num_months += 1

        articles, _ = LoadArticles(filename)
        sum_num_articles += len(articles)
        for a in articles:
            words = a.lower().split()
            token_count += len(words)
            type_count = type_count.union(set(words))

    print("Average num articles", sum_num_articles / float(sum_num_months))
    print("num articles", sum_num_articles)
    print("num tokens", token_count)
    print("num types", len(type_count))
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--outfile', default='ner_out.txt')
    args = parser.parse_args()

    fp = open(args.outfile, "w")
    articles, _ = LoadArticles(args.article_glob)
    # TODO: spaCy has parallelization library
    for a in articles:
        lines = a.split("\n")
        # Write countries my list found and remove from article
        raw_countries = ""
        for i in range(len(lines) - 1, -1, -1):
            if "COUNTRIES" in lines[i]:
                raw_countries = lines[i]
                del lines[i]
                break
        fp.write(raw_countries + "\n")

        a = "\n".join(lines)
        write_spacy(a, fp)
Example #21
def main():
    gold_vector = LoadVectors(args.gold_vectors)[0]

    dirname_to_count = {}
    topic_to_count = {}
    for t in all_topics:
        topic_to_count[t] = (0, 0)

    for filename in glob.iglob(args.per_month_glob):
        dirname = os.path.dirname(filename)
        base_dirname = os.path.basename(dirname)
        vectors = LoadVectors(filename)
        articles, _ = LoadArticles(filename.replace(".50.lda", ""), False)
        similar_articles = GetSimilarArticles(articles, vectors, gold_vector,
                                              args.similarity_threshold)
        count = dirname_to_count.get(base_dirname, (0, 0))
        dirname_to_count[base_dirname] = (count[0] + len(articles),
                                          count[1] + len(similar_articles))
        # probably a better way to do this, but whatever
        for t in all_topics:
            if t in base_dirname:
                count = topic_to_count[t]
                topic_to_count[t] = (count[0] + len(articles),
                                     count[1] + len(similar_articles))

    out_f = open(args.log_file + str(args.similarity_threshold) + ".sub.txt",
                 "w")
    for k in dirname_to_count:
        v = dirname_to_count[k]
        out_f.write(k + " " + str(v[0]) + " " + str(v[1]) + "\n")
    out_f.close()

    out_f = open(args.log_file + str(args.similarity_threshold) + ".all.txt",
                 "w")
    for k in topic_to_count:
        v = topic_to_count[k]
        out_f.write(k + " " + str(v[0]) + " " + str(v[1]) + "\n")
    out_f.close()
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--country_list')
    parser.add_argument('--outfile')
    args = parser.parse_args()

    countries = get_countries(args.country_list)

    dirname_to_articles = {}

    articles, article_index = LoadArticles(args.article_glob)

    for a, i in zip(articles, article_index):

        dirname = os.path.dirname(i[0])
        base_dirname = os.path.basename(dirname)

        # we don't need any more articles on this topic
        curr_list = dirname_to_articles.get(base_dirname, [])
        if len(curr_list) >= MAX_PER_TOPIC:
            continue

        words = a.split()
        count_this_article, _ = contains_country(words, countries)

        if count_this_article:
            curr_list.append((a, i))

            dirname_to_articles[base_dirname] = curr_list

    # output each topic to its own file
    for t in dirname_to_articles:
        fp = open(args.outfile + "/" + t + ".txt", "w")
        for a in dirname_to_articles[t]:
            fp.write(str(a[1]) + "\n" + a[0] + "\n\n")
        fp.close()
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--outfile')
    args = parser.parse_args()

    fp = open(args.outfile, "w")
    headings = ["documents", "date"]
    fp.write(",".join(headings))
    fp.write("\n")
    articles, indices = LoadArticles(args.article_glob)
    for a, i in zip(articles, indices):

        counter = Counter(a.split())

        # Only run on US focused news
        if counter["USA"] < 2:
            continue

        # can probably get better topics with some better preprocessing

        # we're making a csv so we don't want commas in the actual text
        # newlines mess up the R csv reader
        a = a.replace(",", "")
        a = a.replace("\n", " ")

        # we want the year/month
        filename = os.path.basename(i[0])

        # expects filename format to be year_month.txt.tok
        # we're going to date everything the 1st of the month because R doesn't
        # seem to like dates with just a year/month
        year_month = filename.split('.')[0]
        splits = year_month.split('_')
        fp.write(a + "," + "01/" + splits[1] + "/" + splits[0] + "\n")
    fp.close()
#!/usr/bin/env python
from article_utils import LoadArticles, LoadVectors
from collections import Counter

all = "/projects/tir1/users/anjalief/nyt_filtered/*/*.tok"

articles, indices = LoadArticles(all, verbose=False)
print "Loading done"

all_counts = Counter()
business_counts = Counter()

for article, index in zip(articles, indices):
    folder = index[0].split("/")
    words = article.split()
    if folder[-2] == "Business":
        business_counts.update(words)
    else:
        all_counts.update(words)


def write_counts(filename, counts):
    fp = open(filename, 'w')
    for c in counts:
        fp.write(c + " " + str(counts[c]) + "\n")
    fp.close()


write_counts("all_counts.txt", all_counts)
write_counts("business_counts.txt", business_counts)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--country_list')
    parser.add_argument('--labeled_data')
    parser.add_argument('--lemmatize_russian', action='store_true')
    args = parser.parse_args()

    stemmer = None
    if args.lemmatize_russian:
        stemmer = country_russian_stemmer()
    countries = get_countries(args.country_list, stemmer)

    if args.article_glob:
        article_to_count = {}

        articles, article_index = LoadArticles(args.article_glob)

        for a, i in zip(articles, article_index):
            words = a.split()
            count_this_article, _ = contains_country(words, countries, stemmer)
            count = 0
            if count_this_article >= 2:
                count = 1

            tup = article_to_count.get(i[0], (0, 0))
            article_to_count[i[0]] = (tup[0] + count, tup[1] + 1)

        fp = open("article_to_count.txt", 'w')
        for t in article_to_count:
            fp.write(t + " " + str(article_to_count[t][0]) + " " +
                     str(article_to_count[t][1]) + "\n")
        fp.close()

    if args.labeled_data:
        tracker = TrackCorrect()
        articles, _ = LoadArticles(args.labeled_data)
        labels = LoadGold(args.labeled_data + ".labels")
        internal_country_count = []
        external_country_count = []
        for a, l in zip(articles, labels):
            i, cl = texterra_count_countries(a)
            tracker.update(l.is_external, i >= 2)

            # count number of countries per article
            if l.is_external:
                external_country_count.append(i)
            else:
                internal_country_count.append(i)
            if not i and l.is_external:
                fp = open("false_negatives.txt", 'a+')
                fp.write(a)
                fp.write("\n******************* ")
                for c in cl:
                    fp.write(c + " ")
                fp.write("\n\n")
                fp.close()
            elif i >= 2 and not l.is_external:
                print(a, "\n")
                print(cl, "\n\n\n")

        precision, recall, accuracy, gold_external, count = tracker.get_stats()
        print(precision, recall, accuracy, gold_external, count)
        print("EXTERNAL")
        for i in external_country_count:
            print(i)

        print("INTERNAL")
        for i in internal_country_count:
            print(i)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--article_glob')
    parser.add_argument('--file_name')
    parser.add_argument('--num_samples', type=int)
    parser.add_argument('--sample_size', type=int)
    # if we're filtering, it can take forever. Instead, we downsample to
    # some multiple of how many articles we actually need, then filter those,
    # then sample again from the filtered set. This saves us from
    # running the filtering on tons of articles
    parser.add_argument('--downsample', type=int)
    parser.add_argument('--country_list')
    parser.add_argument('--russian_ner', action='store_true')
    parser.add_argument('--label_file_exists', action='store_true')
    parser.add_argument('--merge_labels', action='store_true')
    args = parser.parse_args()

    articles, article_index = LoadArticles(args.article_glob)

    # see above note on pre-sampling
    if args.downsample:
        # we zip, downsample, and unzip
        articles, article_index = zip(*random.sample(
            list(zip(articles, article_index)), args.sample_size *
            args.num_samples * args.downsample))
        print("DOWNSAMPLE", len(articles), len(article_index))

    # if the articles are already labeled, then we split the labels
    # as we split the articles (i.e. instead of the article number)
    if args.label_file_exists:
        labels = []
        for filename in glob.iglob(args.article_glob):
            l = open(filename + ".labels").readlines()
            labels += l
        article_index = labels

    # if we've given a country list, we only keep articles that have
    # a country name in them
    if args.country_list:
        stemmer = country_russian_stemmer()
        countries = get_countries(args.country_list, stemmer)

        # We need this wrapper because the filter functions take different arguments
        def include_by_country(a):
            return contains_country(a.split(), countries, stemmer)

        articles, article_index = filter_articles(articles, article_index,
                                                  include_by_country)

    # probably not going to ever filter by both things so we can do a second pass
    if args.russian_ner:
        articles, article_index = filter_articles(articles, article_index,
                                                  texterra_count_countries)

    # choose sample_size random articles
    print("POST FILTER", len(articles), len(article_index))
    ran = random.sample(list(zip(articles, article_index)),
                        args.sample_size * args.num_samples)
    index = 0
    for i in range(0, args.num_samples):
        articles_fp = open(args.file_name + str(i), 'w+')
        labels_fp = open(args.file_name + str(i) + ".labels", 'w+')

        for j in range(0, args.sample_size):
            write_article(articles_fp, labels_fp, ran[index][0], ran[index][1],
                          args.merge_labels)
            index += 1
        articles_fp.close()
        labels_fp.close()
    print("DONE", args.article_glob)
Example #27
            prior[w] += 1
    return result1, result2, prior


stopwords = set()
if args.stopwords:
    stopwords = LoadStopwords(args.stopwords)
else:
    print("Not using stopwords")

# this means we want to filter, only take articles that have country names
if args.country_list:
    countries = get_countries(args.country_list)

    # special-casing this for now; at the moment we want the prior to be the combined dict
    articles1, _ = LoadArticles(args.first)
    counts1, prior = LoadFilteredCounts(articles1, countries, defaultdict(int))
    articles2, _ = LoadArticles(args.second)
    counts2, prior = LoadFilteredCounts(articles2, countries, prior)

elif args.single_country:
    # take all articles with country name as corpus 1 and articles without as corpus 2
    articles1, _ = LoadArticles(args.first)
    counts1, counts2, prior = LoadSplitCounts(articles1, args.single_country)

else:
    counts1 = LoadCounts(args.first, 0, stopwords)
    counts2 = LoadCounts(args.second, 0, stopwords)
    if args.prior:
        prior = LoadCounts(args.prior, args.min_count, stopwords)
    else: