Esempio n. 1
0
def worker(proc_num, queue):
    """Count word, lemma and lemma_pos frequencies for each queued decade.

    Decades are pulled from ``queue`` without blocking until it is empty.
    For every decade, each file in ``DATA + decade`` is opened, its first
    line is discarded, and every remaining line is parsed with
    ``process_lemma_line``.  Lines whose word or lemma_pos is ``None`` are
    skipped.  The three resulting counters are written with ``write_pickle``
    under ``OUT + "decade_freqs/"`` as ``<decade>-word.pkl``,
    ``<decade>-lemma.pkl`` and ``<decade>-lemma_pos.pkl``.

    Args:
        proc_num: Identifier of this worker; only used in progress output.
        queue: Queue of decades; ``get(block=False)`` must raise ``Empty``
            once it is drained.
    """
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        # A single pre-formatted argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("Proc: %s Decade: %s" % (proc_num, decade))
        word_freqs = Counter()
        lemma_freqs = Counter()
        lemma_pos_freqs = Counter()
        decade_dir = DATA + decade
        for fname in os.listdir(decade_dir):
            with open(decade_dir + "/" + fname) as fp:
                print("%s %s" % (proc_num, fname))
                # Discard the first line (presumably a header -- confirm
                # against the data files).
                fp.readline()
                for line in fp:
                    word, lemma, lemma_pos, _ = process_lemma_line(line)
                    if word is None or lemma_pos is None:
                        continue
                    word_freqs[word] += 1
                    lemma_freqs[lemma] += 1
                    lemma_pos_freqs[lemma_pos] += 1
        out_prefix = OUT + "decade_freqs/" + decade
        write_pickle(word_freqs, out_prefix + "-word.pkl")
        write_pickle(lemma_freqs, out_prefix + "-lemma.pkl")
        write_pickle(lemma_pos_freqs, out_prefix + "-lemma_pos.pkl")
def worker(proc_num, queue, window_size, type, id_map):
    """Accumulate windowed co-occurrence pair counts for each queued decade.

    Decades are pulled from ``queue`` without blocking until it is empty.
    For every decade, each file in ``DATA + decade`` is read (first line
    discarded).  Each parsed line contributes the field selected by ``type``;
    its id from ``id_map`` is appended to a sliding context of at most
    ``2 * window_size + 1`` ids, which is handed to ``_process_context``
    together with the running pair counts.  After all files of a decade are
    read the counts are exported with ``export_mat_from_dict``.

    Args:
        proc_num: Identifier of this worker; only used in progress output.
        queue: Queue of decades; ``get(block=False)`` must raise ``Empty``
            once it is drained.
        window_size: Half-width of the context window.
        type: Which parsed field to count: ``"word"``, ``"lemma"`` or
            ``"lemma_pos"``.
        id_map: Mapping from item to id; looked up with ``id_map[item]``,
            so an item missing from it raises.

    Raises:
        Exception: If ``type`` is not one of the three supported values.
            This is checked once up front, before a decade is taken from
            the queue, instead of on the first parsed line.
    """
    # The field selection does not depend on the line, so resolve it once.
    field_index = {"word": 0, "lemma": 1, "lemma_pos": 2}
    if type not in field_index:
        raise Exception("Unknown type {}".format(type))
    selected = field_index[type]
    max_context = window_size * 2 + 1

    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        # A single pre-formatted argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("Proc: %s Decade: %s" % (proc_num, decade))
        pair_counts = Counter()
        decade_dir = DATA + decade
        for fname in os.listdir(decade_dir):
            with open(decade_dir + "/" + fname) as fp:
                print("%s %s" % (proc_num, fname))
                # Discard the first line (presumably a header -- confirm).
                fp.readline()
                # The context is restarted for every file.
                context = []
                for line in fp:
                    word, lemma, lemma_pos, _ = process_lemma_line(line)
                    item = (word, lemma, lemma_pos)[selected]
                    if item is None:
                        continue
                    context.append(id_map[item])
                    if len(context) > max_context:
                        # Kept as a plain list because _process_context's
                        # requirements on the container are not visible here.
                        context.pop(0)
                    pair_counts = _process_context(context, pair_counts,
                                                   window_size)
        export_mat_from_dict(pair_counts, decade,
                             OUT.format(type=type, window_size=window_size))
Esempio n. 3
0
def worker(proc_num, queue):
    """Count word, lemma and lemma_pos frequencies for each queued decade.

    Pulls decades from ``queue`` (non-blocking) until ``Empty`` is raised.
    For each decade, every file listed in ``DATA + decade`` is read with its
    first line discarded; remaining lines are parsed by
    ``process_lemma_line`` and tallied into three counters.  Lines whose
    word or lemma_pos is ``None`` are ignored.  The counters are saved with
    ``write_pickle`` to ``OUT + "decade_freqs/"`` using the suffixes
    ``-word.pkl``, ``-lemma.pkl`` and ``-lemma_pos.pkl``.

    Args:
        proc_num: Identifier of this worker; only used in progress output.
        queue: Queue of decades to process.
    """
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print("Proc:", proc_num, "Decade:", decade)
        word_freqs = Counter()
        lemma_freqs = Counter()
        lemma_pos_freqs = Counter()
        decade_dir = DATA + decade
        for fname in os.listdir(decade_dir):
            with open(decade_dir + "/" + fname) as fp:
                print(proc_num, fname)
                # Discard the first line (presumably a header -- confirm).
                fp.readline()
                for line in fp:
                    word, lemma, lemma_pos, _ = process_lemma_line(line)
                    # Identity test is the correct check for the None
                    # singleton.
                    if word is None or lemma_pos is None:
                        continue
                    word_freqs[word] += 1
                    lemma_freqs[lemma] += 1
                    lemma_pos_freqs[lemma_pos] += 1
        out_prefix = OUT + "decade_freqs/" + decade
        write_pickle(word_freqs, out_prefix + "-word.pkl")
        write_pickle(lemma_freqs, out_prefix + "-lemma.pkl")
        write_pickle(lemma_pos_freqs, out_prefix + "-lemma_pos.pkl")
Esempio n. 4
0
def worker(proc_num, queue, window_size, type, id_map):
    """Build co-occurrence pair counts per decade and export them.

    The worker drains ``queue`` without blocking.  For each decade it reads
    every file in ``DATA + decade`` (skipping the first line of each), maps
    the field chosen by ``type`` to an id via ``id_map``, keeps the most
    recent ``2 * window_size + 1`` ids as the context, and passes that
    context to ``_process_context`` after every accepted line.  The final
    counts for the decade go to ``export_mat_from_dict``.

    Args:
        proc_num: Identifier of this worker; only used in progress output.
        queue: Queue of decades; ``get(block=False)`` must raise ``Empty``
            once it is drained.
        window_size: Half-width of the context window.
        type: One of ``"word"``, ``"lemma"`` or ``"lemma_pos"``; selects the
            parsed field that is counted.
        id_map: Mapping from item to id, indexed as ``id_map[item]``.

    Raises:
        Exception: If ``type`` is not a supported value.  The check runs
            once before the queue is touched, so a misconfigured worker
            fails without consuming a decade.
    """
    # Position of the selected field in the (word, lemma, lemma_pos) triple.
    positions = {"word": 0, "lemma": 1, "lemma_pos": 2}
    if type not in positions:
        raise Exception("Unknown type {}".format(type))
    position = positions[type]
    context_limit = window_size * 2 + 1
    out_dir = OUT.format(type=type, window_size=window_size)

    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        # Pre-formatted single argument: identical output under the
        # Python 2 print statement and the Python 3 print function.
        print("Proc: %s Decade: %s" % (proc_num, decade))
        pair_counts = Counter()
        for fname in os.listdir(DATA + decade):
            with open(DATA + decade + "/" + fname) as fp:
                print("%s %s" % (proc_num, fname))
                # Discard the first line (presumably a header -- confirm).
                fp.readline()
                context = []
                for line in fp:
                    word, lemma, lemma_pos, _ = process_lemma_line(line)
                    item = (word, lemma, lemma_pos)[position]
                    if item is None:
                        continue
                    context.append(id_map[item])
                    if len(context) > context_limit:
                        # Drop the oldest id so the window keeps its size.
                        context.pop(0)
                    pair_counts = _process_context(context, pair_counts,
                                                   window_size)
        export_mat_from_dict(pair_counts, decade, out_dir)
Esempio n. 5
0
def process_file(fp, word_dict, lemma_dict, lemma_pos_dict):
    """Register unseen words, lemmas and lemma_pos items from one file.

    The first line of ``fp`` is discarded.  Every remaining line is parsed
    with ``process_lemma_line``; lines whose word or lemma_pos is ``None``
    are skipped.  Each item not yet present in its dictionary is added with
    the dictionary's current size as its value, so ids are assigned in
    first-seen order.  The dictionaries are modified in place and nothing
    is returned.

    Args:
        fp: Open, iterable text file.
        word_dict: Mapping word -> id, updated in place.
        lemma_dict: Mapping lemma -> id, updated in place.
        lemma_pos_dict: Mapping lemma_pos -> id, updated in place.
    """
    # Discard the first line (presumably a header -- confirm).
    fp.readline()
    for line in fp:
        word, lemma, lemma_pos, _ = process_lemma_line(line)
        if word is None or lemma_pos is None:
            continue
        # setdefault leaves existing ids untouched; len() is evaluated
        # before any insertion, so a new item gets the next free id.
        # NOTE: lemma itself is not checked for None before being added.
        word_dict.setdefault(word, len(word_dict))
        lemma_dict.setdefault(lemma, len(lemma_dict))
        lemma_pos_dict.setdefault(lemma_pos, len(lemma_pos_dict))
Esempio n. 6
0
def process_file(fp, word_dict, lemma_dict, lemma_pos_dict):
    """Give a new integer id to every item of ``fp`` not yet in its dictionary.

    After discarding the first line, each line is parsed with
    ``process_lemma_line``.  A line is ignored when its word or lemma_pos is
    ``None``.  Otherwise the word, lemma and lemma_pos are each added to
    their dictionary, if absent, with the dictionary's length at that
    moment as the id.  The three dictionaries are mutated in place; the
    function returns ``None``.

    Args:
        fp: Open, iterable text file.
        word_dict: Mapping word -> id, updated in place.
        lemma_dict: Mapping lemma -> id, updated in place.
        lemma_pos_dict: Mapping lemma_pos -> id, updated in place.
    """
    # Discard the first line (presumably a header -- confirm).
    fp.readline()
    for line in fp:
        word, lemma, lemma_pos, _ = process_lemma_line(line)
        if word is None or lemma_pos is None:
            continue
        # Same insert-if-absent step for all three vocabularies, in the
        # order word, lemma, lemma_pos.
        targets = (
            (word, word_dict),
            (lemma, lemma_dict),
            (lemma_pos, lemma_pos_dict),
        )
        for item, mapping in targets:
            if item not in mapping:
                mapping[item] = len(mapping)
Esempio n. 7
0
def worker(proc_num, queue):
    """Count POS tags per word for each queued decade and pick the majority.

    Decades are pulled from ``queue`` without blocking until it is empty.
    For every decade, each file in ``DATA + decade`` is read (first line
    discarded) and parsed with ``process_lemma_line``.  For each line with a
    non-``None`` word and lemma_pos, the second ``"_"``-separated piece of
    lemma_pos is counted as that word's tag.  Two pickles are written per
    decade: ``<decade>-pos-counts.pkl`` with the per-word tag counters and
    ``<decade>-pos-maj.pkl`` mapping each word to its most frequent tag.

    Args:
        proc_num: Identifier of this worker; only used in progress output.
        queue: Queue of decades; ``get(block=False)`` must raise ``Empty``
            once it is drained.
    """
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print("Proc:", proc_num, "Decade:", decade)
        pos_tags = collections.defaultdict(collections.Counter)
        decade_dir = DATA + decade
        for fname in os.listdir(decade_dir):
            with open(decade_dir + "/" + fname) as fp:
                print(proc_num, fname)
                # Discard the first line (presumably a header -- confirm).
                fp.readline()
                for line in fp:
                    word, _, lemma_pos, _ = process_lemma_line(line)
                    if word is None or lemma_pos is None:
                        continue
                    # Raises IndexError if lemma_pos contains no "_".
                    pos_tags[word][lemma_pos.split("_")[1]] += 1
        write_pickle(pos_tags, OUT + decade + "-pos-counts.pkl")
        # max() returns the first tag with the highest count, the same
        # winner a stable descending sort would put first, without sorting.
        pos_maj = {
            word: max(pos_counts, key=pos_counts.__getitem__)
            for word, pos_counts in pos_tags.items()
        }
        write_pickle(pos_maj, OUT + decade + "-pos-maj.pkl")
Esempio n. 8
0
def worker(proc_num, queue):
    """Tally POS tags per word for each queued decade and save the majority tag.

    The worker drains ``queue`` without blocking.  For each decade it reads
    every file in ``DATA + decade`` (skipping the first line of each),
    parses lines with ``process_lemma_line`` and, for lines whose word and
    lemma_pos are not ``None``, counts the second ``"_"``-separated piece of
    lemma_pos under that word.  It then writes the per-word counters to
    ``<decade>-pos-counts.pkl`` and a word -> most-frequent-tag mapping to
    ``<decade>-pos-maj.pkl``, both under ``OUT``.

    Args:
        proc_num: Identifier of this worker; only used in progress output.
        queue: Queue of decades; ``get(block=False)`` must raise ``Empty``
            once it is drained.
    """
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        # Pre-formatted single argument: identical output under the
        # Python 2 print statement and the Python 3 print function.
        print("Proc: %s Decade: %s" % (proc_num, decade))
        pos_tags = collections.defaultdict(collections.Counter)
        for fname in os.listdir(DATA + decade):
            with open(DATA + decade + "/" + fname) as fp:
                print("%s %s" % (proc_num, fname))
                # Discard the first line (presumably a header -- confirm).
                fp.readline()
                for line in fp:
                    word, _, lemma_pos, _ = process_lemma_line(line)
                    if word is None or lemma_pos is None:
                        continue
                    tag = lemma_pos.split("_")[1]
                    pos_tags[word][tag] += 1
        write_pickle(pos_tags, OUT + decade + "-pos-counts.pkl")
        pos_maj = {}
        # items() exists on both Python 2 and 3 (iteritems() is 2-only).
        for word, pos_counts in pos_tags.items():
            # First tag with the highest count; ties resolve to the tag
            # encountered first when iterating the counter.
            pos_maj[word] = max(pos_counts, key=pos_counts.__getitem__)
        write_pickle(pos_maj, OUT + decade + "-pos-maj.pkl")