def get_common_vocab(in_dir, out_dir, out_file_name, in_suffix, years, n_vocab, donor_path, receptor_path):
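    """Build the vocabulary shared by every year and subsample it.

    For each year the first column of the word-pair file is taken as that
    year's vocabulary, and the running intersection across years gives the
    common vocabulary. A Bernoulli draw with p = n_vocab / |common vocab|
    then keeps roughly n_vocab words, the donor and receptor word lists are
    always added back in, and the result is written to out_dir + out_file_name.
    """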
    common_vocab = None
    for year in years:
        col1, col2 = ioutils.load_word_pairs(in_dir + str(year) + in_suffix)
        file_vocab = set(col1)
        if common_vocab is None:
            common_vocab = file_vocab
        else:
            common_vocab = common_vocab & file_vocab
    data_bern = bernoulli.rvs(size=len(common_vocab), p=float(n_vocab) / len(common_vocab))

    common_vocab_list = list(common_vocab)
    random_common_vocab = set()
    for idx, i in enumerate(data_bern):
        if i == 1:
            random_common_vocab.add(common_vocab_list[idx])
    random_common_vocab = random_common_vocab.union(set(ioutils.load_word_list(donor_path)).union(ioutils.load_word_list(receptor_path)))
    ioutils.write_list(out_dir + out_file_name, list(random_common_vocab))
def numpy2text(vec_path, year, extension):
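    """Dump a year's embeddings from numpy/pickle files to plain text.

    Loads the pickled vocabulary and the numpy embedding matrix for `year`,
    writes the vocabulary to a ".vocab" file, and writes the vectors in a
    word2vec-style text layout: a header line with the vocabulary size and
    dimension, then one line per word holding the word and its vector.
    """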
    vocab_list = load_pickle(vec_path + str(year) + "-vocab.pkl")
    w_mat = np.load(vec_path + str(year) + "-w.npy")
    vocab_size = len(vocab_list)
    dim = len(w_mat[0])
    ioutils.write_list(vec_path + str(year) + ".vocab", vocab_list)
    with open(vec_path + str(year) + "-w" + extension, "w") as fp:
        print >> fp, str(vocab_size), str(dim)
        for i, w in enumerate(vocab_list):
            print >> fp, w.encode("utf-8"), " ".join(map(str, w_mat[i, :]))
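
# A minimal usage sketch, not part of the original source: the text file written
# by numpy2text follows the usual "header line, then one word and vector per line"
# layout, so it can typically be read back with gensim. load_text_vectors is an
# illustrative helper name and the path pieces simply mirror numpy2text's arguments.
from gensim.models import KeyedVectors

def load_text_vectors(vec_path, year, extension):
    # binary=False because numpy2text writes a plain-text matrix
    return KeyedVectors.load_word2vec_format(
        vec_path + str(year) + "-w" + extension, binary=False)
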
def worker(proc_num, queue, out_dir, input_dir, out_suffix):
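    """Multiprocessing worker: clean one year's vocabulary at a time.

    Pulls years from the shared queue until it is empty; for each year it
    loads the pickled vocabulary, drops non-alphabetic entries with
    remove_non_alph, and writes the cleaned list to the output directory.
    """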
    while True:
        if queue.empty():
            break
        year = queue.get()

        print proc_num, "Cleaning vocab of year", year
        vocab_list = ioutils.load_pickle(input_dir + VOCAB_FILE.format(year=year))
        cleaned_vocab_list = remove_non_alph(vocab_list)
        ioutils.write_list(out_dir + str(year) + out_suffix, cleaned_vocab_list)
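
# A minimal usage sketch, not from the original source: workers like the one above
# are normally driven by filling a multiprocessing queue with all years up front
# and then starting one process per worker, so the empty()-then-get() pattern runs
# against a fully populated queue. run_workers and num_procs are illustrative
# names, not definitions from this repository.
from multiprocessing import Process, Queue

def run_workers(num_procs, years, out_dir, input_dir, out_suffix):
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue, out_dir, input_dir, out_suffix))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
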
def prepare_donor_receptor_lists(raw_data_file, words_pos_file, donor_out_file,
                                 receptor_out_file, n, same_pos):
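    """Build and dump donor and receptor word lists.

    Reads the word/POS table, extracts the donor words and receptor
    candidates from the raw data file, then selects receptors either matched
    on part of speech (same_pos=True) or without the POS constraint, and
    writes both lists to the given output files.
    """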
    print "Reading words and pos tags"
    word_pos_df = pd.read_csv(words_pos_file)
    donor_list, receptor_candidate_list = get_donor_receptor_candidate(
        raw_data_file, n)
    receptor_list = get_receptor_same_pos(donor_list, receptor_candidate_list, word_pos_df) if same_pos \
        else get_receptor_no_pos(donor_list, receptor_candidate_list)
    print "Dumping donor and receptor lists"
    ioutils.write_list(donor_out_file, donor_list)
    ioutils.write_list(receptor_out_file, receptor_list)
def get_common_vocab(in_dir, out_dir, out_file_name, in_suffix, years):
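    """Intersect per-year corpus vocabularies and write the result.

    Reads each year's corpus into a word list, keeps the intersection of the
    resulting vocabularies across all years, and writes it to
    out_dir + out_file_name.
    """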
    common_vocab = None
    for year in years:
        file_vocab = set(read_corpus_to_list(in_dir + str(year) + in_suffix))
        if common_vocab is None:
            common_vocab = file_vocab
        else:
            common_vocab = common_vocab & file_vocab

    ioutils.write_list(out_dir + out_file_name, list(common_vocab))
def main(args):
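    """Count word frequencies in a file and dump them, skipping stop words.

    Tokenizes the file into sentences and words with NLTK, counts every
    token, and writes "word count" lines for all non-stop-word tokens in
    descending frequency order to "<filename>_freq".
    """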
    encoding = sys.stdout.encoding or 'utf-8'
    f = open(args.filename)
    fd = nltk.FreqDist()
    stopWords = set(stopwords.words('english'))
    for line in f:
        for sent in nltk.sent_tokenize(line):
            for word in nltk.word_tokenize(sent):
                fd[word] += 1
    f.close()

    words_freq = []
    for w, count in fd.most_common():
        if w not in stopWords:
            tup = u"{} {}".format(w, count)
            words_freq.append(tup.encode(encoding))
    ioutils.write_list(args.filename + "_freq", words_freq)
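
# A minimal sketch, not from the original source, of the argument parsing that
# main(args) above appears to expect: it only relies on args.filename. The parser
# layout and description string are assumptions.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Count word frequencies in a text file")
    parser.add_argument("filename", help="path to the input text file")
    main(parser.parse_args())
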
def create_comman_vocab(in_dir, ngram_file_suffix, out_dir, out_file_name,
                        years, lang):
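    """Build the vocabulary common to all years from n-gram files.

    For each year, collects lowercased purely alphabetic words longer than
    two characters that are not stop words, intersects these sets across
    years, and writes the common vocabulary to out_dir + out_file_name.
    """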
    stop_set = set(stopwords.words(lang))
    common_vocab_set = set()

    for year in years:
        file_content_list = ioutils.load_word_list(in_dir + str(year) +
                                                   ngram_file_suffix)
        words_set = set()
        for line in file_content_list:
            words_line = line.split()
            for w in words_line:
                if not (w.lower().isalpha()) or (w.lower() in stop_set) or (
                        w.lower() in words_set) or (len(w) <= 2):
                    continue
                words_set.add(w.lower())

        if year != years[0]:
            common_vocab_set = common_vocab_set.intersection(words_set)
        else:
            common_vocab_set = words_set
    ioutils.write_list(out_dir + out_file_name, list(common_vocab_set))