Ejemplo n.º 1
0
def get_word_count(input_file,
                   filter_stopwords=True,
                   ngrams=1,
                   bigram_dict=None,
                   words_func=None):
    """Tally token occurrences across every document in *input_file*.

    Each JSON record's "text" field is tokenized by *words_func* (which
    receives the ngrams / filter_stopwords / bigram_dict options verbatim),
    and each resulting token increments its count.

    Returns a defaultdict(int) mapping token -> occurrence count.
    """
    counts = collections.defaultdict(int)
    for record in utils.read_json_list(input_file):
        tokens = words_func(record["text"],
                            ngrams=ngrams,
                            filter_stopwords=filter_stopwords,
                            bigram_dict=bigram_dict)
        for token in tokens:
            counts[token] += 1
    return counts
Ejemplo n.º 2
0
def load_doc_topics(input_file, doc_topic_file, threshold=0.01):
    """Load topics in each document.

    Pairs each JSON record from *input_file* with the corresponding line of
    *doc_topic_file* (MALLET-style: the first two columns are skipped) and
    keeps the indices of topics whose weight exceeds *threshold*.
    """
    articles = []
    with open(doc_topic_file) as topic_fh:
        for record in utils.read_json_list(input_file):
            line = topic_fh.readline()
            if not line:
                # topic file exhausted before the documents: stop pairing
                break
            weights = line.strip().split()[2:]
            topic_ids = {idx for idx, weight in enumerate(weights)
                         if float(weight) > threshold}
            articles.append(utils.IdeaArticle(fulldate=int(record["date"]),
                                              ideas=topic_ids))
    return articles
Ejemplo n.º 3
0
def convert_word_count_mallet(word_dict,
                              input_file,
                              output_file,
                              words_func=None):
    """Write the documents of *input_file* in MALLET's repeated-word-id format.

    Each output line is "<doc_id> <date> <wid wid ...>" where every word id
    from *word_dict* appears once per occurrence in the document, sorted by
    word id. Document ids start at 1.
    """
    with open(output_file, "w") as fout:
        for doc_id, record in enumerate(utils.read_json_list(input_file),
                                        start=1):
            counts = collections.Counter(words_func(record["text"]))
            # keep only in-vocabulary words, as (word_id, count), sorted by id
            id_counts = sorted((word_dict[w], c) for w, c in counts.items()
                               if w in word_dict)
            repeated = [" ".join([str(wid)] * cnt) for wid, cnt in id_counts]
            fout.write("%s %s %s\n" %
                       (doc_id, record["date"], " ".join(repeated)))
Ejemplo n.º 4
0
def load_word_articles(input_file, vocab_file, data_dir, vocab_size=100):
    """Build IdeaArticle records whose ideas are vocabulary word ids.

    Loads a word map of at most *vocab_size* entries, its reverse lookup, and
    the bigram phrases from *data_dir*; each article's ideas are the ids of
    in-vocabulary tokens found in its text.

    Returns (articles, word_set, word_map).
    """
    word_map = utils.read_word_dict(vocab_file, vocab_size=vocab_size)
    word_set = utils.get_reverse_dict(word_map)
    bigram_dict = wc.load_bigrams("%s/bigram_phrases.txt" % data_dir)
    words_func = functools.partial(wc.get_mixed_tokens,
                                   bigram_dict=bigram_dict)
    articles = []
    for record in utils.read_json_list(input_file):
        tokens = words_func(record["text"])
        idea_ids = {word_set[t] for t in tokens if t in word_set}
        articles.append(utils.IdeaArticle(fulldate=int(record["date"]),
                                          ideas=idea_ids))
    return articles, word_set, word_map
Ejemplo n.º 5
0
def convert_word_count_mallet(word_dict, input_file, output_file,
                              words_func=None):
    """Write documents in MALLET's repeated-word-id format, unless the
    output file already exists (in which case the conversion is skipped).

    Each line is "<doc_id> <date> <wid wid ...>", word ids sorted ascending,
    each repeated once per occurrence; document ids start at 1.
    """
    # Guard clause: a pre-existing output file means work was already done.
    if os.path.exists(output_file):
        print("convert_word_count_mallet: output file found at: {}, skipping".format(output_file))
        return
    with open(output_file, "w") as fout:
        for doc_id, record in enumerate(utils.read_json_list(input_file),
                                        start=1):
            counts = collections.Counter(words_func(record["text"]))
            pairs = sorted((word_dict[w], counts[w])
                           for w in counts if w in word_dict)
            word_cnts = [" ".join([str(wid)] * cnt) for wid, cnt in pairs]
            fout.write("%s %s %s\n" %
                       (doc_id, record["date"], " ".join(word_cnts)))
Ejemplo n.º 6
0
def load_doc_topics(input_file, doc_topic_file, threshold=0.01):
    """Load topics in each document.

    Walks *input_file* and *doc_topic_file* in lockstep; for each document,
    the topic line's first two columns are skipped and the indices of topic
    weights above *threshold* become the article's ideas.
    """
    articles = []
    with open(doc_topic_file) as topic_fh:
        for record in utils.read_json_list(input_file):
            line = topic_fh.readline()
            if not line:
                # ran out of topic lines: remaining documents are dropped
                break
            weights = line.strip().split()[2:]
            kept = {i for i, w in enumerate(weights) if float(w) > threshold}
            articles.append(utils.IdeaArticle(fulldate=int(record["date"]),
                                              ideas=kept))
    return articles
Ejemplo n.º 7
0
def preprocess_input(input_file, output_file, func=tokenize):
    """Re-tokenize each record's "text" with *func* and write the records
    back out as a JSON list to *output_file*."""
    processed = []
    for record in utils.read_json_list(input_file):
        record["text"] = " ".join(func(record["text"]))
        processed.append(record)
    utils.write_json_list(output_file, processed)