Example No. 1
def find_missing_word(model, line):
    words = tokenize_words(line)
    if len(words) <= 2:
        return 1
    scores = list(p for p, _, _ in model.full_scores(line))
    # missing word cannot be the first or last, per rules
    idx = np.argmin(scores[1:-2]) + 1
    return idx
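A minimal usage sketch, assuming the model is a kenlm language model and that tokenize_words is a plain whitespace split (neither is part of this listing); kenlm's full_scores yields one (log10 probability, n-gram length, OOV flag) tuple per token plus one for the end-of-sentence marker, and the function above returns the index of the lowest-scoring interior token as its guess for where a word was removed.

import kenlm                                 # assumption: the model is a kenlm model
import numpy as np

def tokenize_words(line):                    # stand-in for the project's tokenizer
    return line.split()

model = kenlm.Model('model.arpa')            # hypothetical model file
line = 'the cat sat the mat'                 # a sentence with one word removed
print(find_missing_word(model, line))        # index of the lowest-scoring interior token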
Example No. 3
def missing_word_index(sentence, ref_sentence, lo=0, hi=None):
    '''
    Use bisection search to find the location of the missing word
    in @sentence with respect to @ref_sentence. Return the index
    of the missing word in @ref_sentence.
    '''
    words = tokenize_words(sentence)
    ref_words = tokenize_words(ref_sentence)
    assert len(words) == len(ref_words) - 1
    lo = lo if lo is not None else 0
    hi = hi if hi is not None else len(ref_words)
    i = (lo + hi) // 2
    while lo+1 < hi:
        if words[i] == ref_words[i]:
            lo = i
        else:
            hi = i
        i = (lo + hi) // 2
        
    if i < len(words) and words[i] == ref_words[i]: i += 1
    assert words[i-1] == ref_words[i-1]
    assert words[i] != ref_words[i] or i == len(words)
    return i
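A short worked example of the bisection, assuming tokenize_words is a plain whitespace split (the real helper is not part of this listing). Each step compares the midpoint token of the shortened sentence against the same position in the reference: a match means the gap lies to the right, a mismatch means it lies at or to the left, so the search converges on the index where the word was removed.

def tokenize_words(line):                    # stand-in for the project's tokenizer
    return line.split()

ref = 'the cat sat on the mat'
sample = 'the cat sat the mat'               # 'on' (index 3 in ref) was removed
print(missing_word_index(sample, ref))       # -> 3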
Example No. 4
def letter_frequencies(istream, n=1):
    counts = defaultdict(int)
    nwords = 0
    for i, line in enumerate(istream):
        words = tokenize_words(line)
        nwords += len(words)
        for word in words:
            #for letter in set(window(word,n)):
            for letter in set(word):
                counts[letter] += 1
        if i % PROGRESS == 0:
            print >> sys.stderr, i

    # Normalize counts to total number of words
    for k in counts.keys():
        counts[k] /= float(nwords)
    return counts
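Because each word's letters are deduplicated with set(word) before counting, the normalized values are the fraction of words that contain a given letter rather than raw letter frequencies. A minimal usage sketch under Python 2 (matching the print syntax above), with stand-ins for the module-level PROGRESS constant and tokenize_words, neither of which is shown here:

import sys
from collections import defaultdict

PROGRESS = 100000                            # hypothetical progress interval

def tokenize_words(line):                    # stand-in for the project's tokenizer
    return line.split()

with open('corpus.txt') as istream:          # hypothetical input file
    freqs = letter_frequencies(istream)
# freqs['e'] is now the fraction of words containing the letter 'e'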
Example No. 6
def make_mishnaic_training_context():
    training = []
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]

    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]

    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            training += [{'language':'mishnaic', 'phrase': util.tokenize_words(p)} for p in first_sec_str.split(u'. ')]

    total_words = 0
    total_phrases = len(training)
    for p in training:
        total_words += len(p['phrase'])

    print 'NUM PHRASES: {} AVG WORDS PER PHRASE: {}'.format(total_phrases,total_words/total_phrases)
    return training
Example No. 7
def find_missing_word(model, vocab, line, n):
    '''
    Return the location and word that maximizes the sentence probability
    if that word is inserted at that location
    '''
    words = tokenize_words(line)
    if len(words) <= 2:
        return max_prob_word_at(words, 1, vocab)

    # missing word cannot be the first or last
    top_n = TopK(n)
    for i in range(1, len(words) - 1):
        #print >>sys.stderr, "Considering words inserted at %d:" % i
        top_n_i = max_prob_word_at(words, i, vocab, n)
        #print_top_n(top_n_i)
        top_n.update(top_n_i)
        #print >>sys.stderr, "Current best:"
        #print_top_n(top_n)
    return top_n
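TopK and max_prob_word_at are project helpers that do not appear in this listing. A minimal sketch of what a TopK container could look like, inferred from the calls above and offered only as an assumption, keeps the n best-scoring candidates in a heap:

import heapq

class TopK(object):
    '''Keep the k highest-scoring items, e.g. (probability, position, word) tuples.
    Hypothetical sketch; the project's own implementation is not shown here.'''
    def __init__(self, k):
        self.k = k
        self._heap = []                      # min-heap holding the current best k items

    def add(self, item):
        if len(self._heap) < self.k:
            heapq.heappush(self._heap, item)
        else:
            heapq.heappushpop(self._heap, item)

    def update(self, items):
        for item in items:                   # merge another batch of candidates
            self.add(item)

    def __iter__(self):
        return iter(sorted(self._heap, reverse=True))   # best candidates first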
Example No. 8
def find_missing_word(model, vocab, line):
    '''
    Return the location and word that maximizes the sentence probability
    if that word is inserted at that location
    '''
    words = tokenize_words(line)
    if len(words) <= 2:
        best, _ = max_prob_word_at(words, 1, vocab)
        return 1, best

    # missing word cannot be the first or last
    max_p = -float('inf')
    best = None
    for i in xrange(1, len(words) - 1):
        i_best, i_max_p = max_prob_word_at(words, i, vocab)
        if i_max_p > max_p:
            max_p = i_max_p
            best = (i, i_best)
    return best
Example No. 11
    def preprocess(self):
        with open("../data/data.txt") as f:
            text = f.read()
        self.processed_inputs = tokenize_words(text)
        self.chars = sorted(list(set(self.processed_inputs)))
        char_to_num = dict((c, i) for i, c in enumerate(self.chars))
        input_len = len(self.processed_inputs)
        self.vocab_len = len(self.chars)
        print("Total number of characters:", input_len)
        print("Total vocab:", self.vocab_len)

        for i in range(0, input_len - self.seq_length, 1):
            in_seq = self.processed_inputs[i:i + self.seq_length]
            out_seq = self.processed_inputs[i + self.seq_length]
            self.x_data.append([char_to_num[char] for char in in_seq])
            self.y_data.append(char_to_num[out_seq])

        n_patterns = len(self.x_data)
        self.X = np.reshape(self.x_data, (n_patterns, self.seq_length, 1))
        self.X = self.X / float(self.vocab_len)
        self.y = tf.keras.utils.to_categorical(self.y_data)
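The loop slides a fixed-length character window over the text and pairs each window with the character that follows it. A standalone illustration of the same windowing on a short string (the seq_length of 5 is only an illustrative value):

text = 'hello world'
seq_length = 5                               # illustrative only
chars = sorted(set(text))
char_to_num = {c: i for i, c in enumerate(chars)}

x_data, y_data = [], []
for i in range(len(text) - seq_length):
    in_seq = text[i:i + seq_length]          # e.g. 'hello'
    out_seq = text[i + seq_length]           # the character right after the window
    x_data.append([char_to_num[c] for c in in_seq])
    y_data.append(char_to_num[out_seq])
# x_data rows are windows of 5 character ids; y_data holds the id of the next character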
Example No. 12
def remove_random_word(line):
    '''
    Remove a random word from line, attempting to remove
    contractions whole and not punctuation / numbers.
    '''
    words = tokenize_words(line)
    choices = removable_words(words)
    if len(choices) == 0:
        return line
    selected = random.choice(choices)
    if words[selected].startswith("'"):
        # second part of possessive/contraction
        words.pop(selected)
        words.pop(selected - 1)
    elif selected + 1 < len(words) and words[selected + 1].startswith("'"):
        # first part of possessive/contraction
        words.pop(selected + 1)
        words.pop(selected)
    else:  # regular word
        words.pop(selected)
    return ' '.join(words)
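removable_words is not included in this listing. Based on the docstring, a plausible sketch (an assumption, not the project's actual helper) returns only the indices of word-like tokens so that bare punctuation and numbers are never selected:

def removable_words(words):
    # Hypothetical helper: keep indices of tokens containing at least one letter,
    # leaving punctuation and numbers out of the candidate pool. Contraction
    # fragments such as "'s" still qualify and are handled by the caller above.
    return [i for i, w in enumerate(words) if any(c.isalpha() for c in w)]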
Example No. 13
def make_mishnaic_training():
    training = []
    num_mishnah_per_mesechta = 30000  # effectively all mishnah
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]

    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]

    mish_set = set()
    num_removed = 0
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        if len(mishna_segs) >= num_mishnah_per_mesechta:
            mishna_segs = mishna_segs[:num_mishnah_per_mesechta]
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            word_list = util.tokenize_words(first_sec_str)
            for word in word_list:
                if random.random() > 0.45 and word in mish_set:
                    num_removed += 1
                    continue
                training.append({'word':word,'tag':'mishnaic'})
                mish_set.add(word)
    print "Num Mishna removed: {}".format(num_removed)
    return training
Example No. 14
def load_mapping(istream):
    mapping = {}
    for line in istream:
        from_word, to_word = line.rstrip().split()
        mapping[from_word] = to_word
    return mapping


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('sentences',
                        type=argparse.FileType('r'),
                        help='File with sentences')
    parser.add_argument('mapping',
                        type=argparse.FileType('r'),
                        help='File with word map')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()

    mapping = load_mapping(args.mapping)
    for i, sentence in enumerate(args.sentences):
        words = [mapping.get(w, UNKNOWN) for w in tokenize_words(sentence)]
        print ' '.join(words)

        if i % 500000 == 0:
            print >> sys.stderr, i
#!/usr/bin/env python

'''Replace words with their word2vec class'''

import sys, argparse
from util import tokenize_words, load_vocab, UNKNOWN

def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('classes', type=argparse.FileType('r'),
        help='File with word2vec classes')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    
    print >>sys.stderr, "Loading word2vec classes"
    vocab = load_vocab(args.classes)
        
    for i, line in enumerate(sys.stdin):
        words = [vocab.get(w, UNKNOWN) for w in tokenize_words(line)]
        print ' '.join(map(str, words))
        
        if i % 100000 == 0:
            print >>sys.stderr, i
        
Example No. 16
#!/usr/bin/env python
'''
Insert blanks like madlib in place of removed words
'''

import sys, argparse
from itertools import izip
from util import tokenize_words


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('sample',
                        type=argparse.FileType('r'),
                        help='Sentences with one missing word')
    parser.add_argument('removed',
                        type=argparse.FileType('r'),
                        help='File with predicted indices of missing words')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()

    for sentence, i_missing in izip(args.sample, args.removed):
        words = tokenize_words(sentence)
        i_missing = int(i_missing)
        words.insert(i_missing, ' ')
        print ' '.join(words)
Example No. 18
    parser.add_argument('classifier', type=argparse.FileType('r'),
        help='Input pickle file with classifier to re-use')
    parser.add_argument('predictions', type=argparse.FileType('r'),
        help='Input file with predicted words and locations')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    
    print >>sys.stderr, "Loading test data"
    X = load(args.data)
    X = np.asarray(X, dtype=np.float32)
    X = np.nan_to_num(X)   
 
    print >>sys.stderr, "Loading classifer"
    clf = load_classifier(args.classifier)
    
    print >>sys.stderr, "Predicting decisions"
    d = clf.predict(X)
    
    print >>sys.stderr, "Performing decisions on stdin"
    for di, line, pred in izip(d, sys.stdin, args.predictions):
        pred = Prediction.parse(pred)
        words = tokenize_words(line)
        if di == 0: #  do nothing
            pass
        elif di == 1: # insert space
            words.insert(pred.location, ' ')
        else: # insert word
            words.insert(pred.location, pred.word)
        print ' '.join(words)
Example No. 19
def num_tokens(line):
    words = tokenize_words(line)
    return len(words)
        help='Gold-standard POS-tagged sentences')
    parser.add_argument('errors', type=argparse.FileType('w'),
        help='Pickle file with errors broken down by POS tag')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    
    counts = defaultdict(lambda: defaultdict(int))
    nerrors = 0
    nsentences = 0
    
    for sentence, ref_sentence, i_removed in izip(args.sample, args.gold, args.removed):
        try:
            i_removed = int(i_removed)
            words = tokenize_words(sentence)
            ref_words = tokenize_words(ref_sentence)
            assert len(words) == len(ref_words)-1
            pos = map(pos_tag, words)
            ref_pos = map(pos_tag, ref_words)
        
            has_error = False
            for i in xrange(i_removed):
                counts[pos[i]][ref_pos[i]] += 1
                has_error |= (pos[i] != ref_pos[i])
            for i in xrange(i_removed, len(words)):
                counts[pos[i]][ref_pos[i+1]] += 1
                has_error |= (pos[i] != ref_pos[i+1])
            if has_error: 
                nerrors += 1
                sys.stdout.write(ref_sentence)
Example No. 21
def match_cal_segments(mesechta):
    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        return [
            new_obj
        ]  #returns a single element array which will replace a range s:e in the original array

    cal_lines = json.load(open(
        "data/1_cal_input/cal_lines_{}.json".format(mesechta), "r"),
                          encoding="utf8")
    #cal_pos_hashtable = json.load(open("cal_pos_hashtable.json","r"),encoding='utf8')
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]

    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0

    num_sef_words = 0
    num_cal_words = 0
    num_words_matched = 0

    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty(): continue
        if ical >= len(dafs): break

        daf = dafs[ical]
        print "-----{} DAF {}  ({}/{})-----".format(mesechta, daf, ical,
                                                    len(dafs))

        base_tc = TextChunk(curr_sef_ref, "he",
                            "William Davidson Edition - Aramaic")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += util.tokenize_words(segment)

        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]

        lines = [[word_obj["word"] for word_obj in temp_line]
                 for temp_line in lines_by_daf[ical]]
        word_obj_list = [
            word_obj for temp_line in lines_by_daf[ical]
            for word_obj in temp_line
        ]
        lines_by_str = [u' '.join(line_array) for line_array in lines]

        curr_cal_ref = Ref("{} {}".format(mesechta, daf))

        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []

        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            matched = dibur_hamatchil_matcher.match_text(
                bas_word_list,
                lines_by_str,
                verbose=True,
                word_threshold=0.27,
                char_threshold=0.6,
                with_abbrev_matches=True,
                with_num_abbrevs=False)
            start_end_map = matched["matches"]
            abbrev_matches = matched["abbrevs"]
            abbrev_ranges = [[am.rashiRange for am in am_list]
                             for am_list in abbrev_matches]
            print u' --- '.join(
                [unicode(am) for am_list in abbrev_matches for am in am_list])
            abbrev_count = 0
            for ar in abbrev_ranges:
                abbrev_count += len(ar)
            #if abbrev_count > 0:
            #    print "GRESATLJL THNA DZEOR", abbrev_ranges
            for iline, se in enumerate(start_end_map):

                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    abbrev_ranges[iline].sort(key=lambda x: x[0])
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  #TODO there's an issue with the abbrev func, but i'm too lazy to fix now. sometimes they're zero length

                        #redefine ar by how many actual words are in the range, not just how many elements
                        start_ar = ar[0]
                        i_abbrev = start_ar
                        num_words = 0
                        while i_abbrev < len(curr_cal_line):
                            temp_w = curr_cal_line[i_abbrev]
                            num_words += len(re.split(ur'\s+', temp_w))
                            if num_words >= (ar[1] - ar[0] + 1):
                                break
                            i_abbrev += 1
                        end_ar = i_abbrev

                        ar = (start_ar, end_ar)
                        if len(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]
                               ) != len(
                                   word_obj_list[ar[0] - offset +
                                                 len(cal_words):ar[1] + 1 -
                                                 offset + len(cal_words)]):
                            #something's wrong. not sure what, but best to ignore this
                            continue
                        print u"ABBREV RANGE {} --- OFFSET {}".format(
                            ar, offset)
                        print u"CURR CAL LINE BEFORE {}".format(u','.join(
                            curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]))
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [
                            u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 -
                                                    offset])
                        ]
                        print u"CURR CAL LINE AFTER {}".format(
                            curr_cal_line[ar[0] - offset])
                        print u"WORD OBJ LIST BEFORE {}".format(u','.join([
                            u'({})'.format(obj['word'])
                            for obj in merge_cal_word_objs(
                                ar[0] - offset + len(cal_words), ar[1] + 1 -
                                offset + len(cal_words), word_obj_list)
                        ]))
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] +
                                      1 - offset +
                                      len(cal_words)] = merge_cal_word_objs(
                                          ar[0] - offset + len(cal_words),
                                          ar[1] + 1 - offset + len(cal_words),
                                          word_obj_list)
                        print u"WORD OBJ LIST AFTER {}".format(
                            word_obj_list[ar[0] - offset +
                                          len(cal_words)]['word'])
                        offset += ar[1] - ar[0]
                        global_offset += offset

                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1)
                                         for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]
                #print u'base line',u' '.join(curr_bas_line)
                matched_obj_words_base = dibur_hamatchil_matcher.match_text(
                    curr_bas_line,
                    curr_cal_line,
                    char_threshold=0.35,
                    verbose=False,
                    with_num_abbrevs=False)
                matched_words_base = matched_obj_words_base["matches"]
                word_for_word_se += [(tse[0] + se[0],
                                      tse[1] + se[0]) if tse[0] != -1 else tse
                                     for tse in matched_words_base]

            matched_word_for_word_obj = dibur_hamatchil_matcher.match_text(
                bas_word_list,
                cal_words,
                char_threshold=0.35,
                prev_matched_results=word_for_word_se,
                boundaryFlexibility=2,
                with_num_abbrevs=False)
            matched_word_for_word = matched_word_for_word_obj["matches"]
            cal_len = len(matched_word_for_word)
            bad_word_offset = 0
            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({
                        "word":
                        word_obj_list[ical_word]["word"],
                        "index":
                        ical_word
                    })
                    continue

                #dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    #in case a cal_words and word_obj_list aren't the same length bc a word got split up
                    """
                    if cal_words[ical_word] != word_obj_list[ical_word-bad_word_offset]["word"]:
                        if ical_word+1 < len(cal_words) and cal_words[ical_word+1] != word_obj_list[ical_word-bad_word_offset+1]["word"]:
                            bad_word_offset += 1
                        continue
                    """
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word

            print u"\n-----\nFOUND {}/{} ({}%)".format(
                cal_len - len(missed_words), cal_len,
                (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            #print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
            num_cal_words += cal_len
            num_words_matched += (cal_len - len(missed_words))
        """
        #tag 1 pos words if still untagged
        for iwo,word_obj in enumerate(temp_out):
            word = word_obj["word"]
            if word in cal_pos_hashtable:
                if len(cal_pos_hashtable[word]) == 1:
                    temp_out[iwo] = {"word":word,"cal_word":word,"class":"talmud","POS":cal_pos_hashtable[word][0]}
        """

        num_sef_words += len(temp_out)

        out += temp_out

        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta),
                                                 "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        util.make_folder_if_need_be(
            "data/2_matched_sefaria/json/{}".format(mesechta))
        fp = codecs.open("data/2_matched_sefaria/json/{}/{}.json".format(
            mesechta, sef_daf),
                         "w",
                         encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()

    return num_sef_words, num_cal_words, num_words_matched
Example No. 22
def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('sentences',
                        type=argparse.FileType('r'),
                        help='File with sentences with <unk>')
    parser.add_argument('pos',
                        type=argparse.FileType('r'),
                        help='File with POS tags')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()

    for i, (sentence, pos_tags) in enumerate(izip(args.sentences, args.pos)):
        words = tokenize_words(sentence)
        pos = tokenize_words(pos_tags)

        if len(words) != len(pos):
            print >>sys.stderr, 'Sentence has %d words, but POS has %d' \
                % (len(words), len(pos))
            print >> sys.stderr, words
            print >> sys.stderr, pos
            print ' '.join(words)
            continue

        for j, word in enumerate(words):
            if word == '<unknown>':
                words[j] = pos[j]
        print ' '.join(words)
Example No. 23
                        help='Pickle file with errors broken down by POS tag')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()

    counts = defaultdict(lambda: defaultdict(int))
    nerrors = 0
    nsentences = 0

    for sentence, ref_sentence, i_removed in zip(args.sample, args.gold,
                                                 args.removed):
        try:
            i_removed = int(i_removed)
            words = tokenize_words(sentence)
            ref_words = tokenize_words(ref_sentence)
            assert len(words) == len(ref_words) - 1
            pos = list(map(pos_tag, words))
            ref_pos = list(map(pos_tag, ref_words))

            has_error = False
            for i in range(i_removed):
                counts[pos[i]][ref_pos[i]] += 1
                has_error |= (pos[i] != ref_pos[i])
            for i in range(i_removed, len(words)):
                counts[pos[i]][ref_pos[i + 1]] += 1
                has_error |= (pos[i] != ref_pos[i + 1])
            if has_error:
                nerrors += 1
                sys.stdout.write(ref_sentence)
import sys, argparse
from itertools import izip
from util import tokenize_words, UNKNOWN

def load_mapping(istream):
    mapping = {}
    for line in istream:
        from_word, to_word = line.rstrip().split()
        mapping[from_word] = to_word
    return mapping

def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('sentences', type=argparse.FileType('r'),
        help='File with sentences')
    parser.add_argument('mapping', type=argparse.FileType('r'),
        help='File with word map')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    
    mapping = load_mapping(args.mapping)
    for i, sentence in enumerate(args.sentences):
        words = [mapping.get(w, UNKNOWN) for w in tokenize_words(sentence)]
        print ' '.join(words)
        
        if i % 500000 == 0:
            print >>sys.stderr, i
        
Example No. 25
#!/usr/bin/env python
'''Replace words with their word2vec class'''

import sys, argparse
from util import tokenize_words, load_vocab, UNKNOWN


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('classes',
                        type=argparse.FileType('r'),
                        help='File with word2vec classes')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()

    print("Loading word2vec classes", file=sys.stderr)
    vocab = load_vocab(args.classes)

    for i, line in enumerate(sys.stdin):
        words = [vocab.get(w, UNKNOWN) for w in tokenize_words(line)]
        print(' '.join(map(str, words)))

        if i % 100000 == 0:
            print(i, file=sys.stderr)