Example 1
def bag_of_words(defs, stem_flag, quiet=False):
    '''convert dictionary definitions into bags of words'''

    # convert to bag of words, count words

    if not quiet:
        print "Converting defs to bags of words"

    count = {}

    pr = progressbar.ProgressBar(len(defs), quiet)

    empty_keys = set()

    for lemma in defs:
        pr.advance()

        defs[lemma] = [
            tesslang.standardize('any', w)
            for w in pat.clean['any'].split(defs[lemma])
            if not w.isspace() and w != ''
        ]

        if len(defs[lemma]) > 0:
            for d in defs[lemma]:
                if d in count:
                    count[d] += 1
                else:
                    count[d] = 1
        else:
            empty_keys.add(lemma)

    if not quiet:
        print "Removing hapax legomena"

    pr = progressbar.ProgressBar(len(defs), quiet)

    for lemma in defs:
        pr.advance()

        defs[lemma] = [w for w in defs[lemma] if count[w] > 1]

        if defs[lemma] == []:
            empty_keys.add(lemma)

    if not quiet:
        print 'Lost {0} empty definitions'.format(len(empty_keys))

    for k in empty_keys:
        del defs[k]

    return (defs)
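The same idea may be easier to see in a standalone form on toy data; this sketch (hypothetical input, plain whitespace tokenization instead of pat.clean and tesslang.standardize) keeps only words that occur more than once across all definitions and drops definitions that end up empty.

from collections import Counter

# hypothetical definitions keyed by lemma
defs = {
    'amo': 'to love, to like',
    'odi': 'to hate',
}

# tokenize each definition (the real code standardizes each token as well)
tokens = {lemma: text.replace(',', '').split() for lemma, text in defs.items()}

# count how often each word occurs across all definitions
count = Counter(w for words in tokens.values() for w in words)

# drop hapax legomena, then drop definitions that end up empty
bags = {lemma: [w for w in words if count[w] > 1] for lemma, words in tokens.items()}
bags = {lemma: words for lemma, words in bags.items() if words}

print(bags)    # e.g. {'amo': ['to', 'to'], 'odi': ['to']}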
Example 2
def export_dict(defs, name, quiet):
    '''export definitions as a text file'''

    dir_export = os.path.join(fs['data'], 'synonymy', 'dict-diagnostic')
    # clear out any previous export; ignore_errors covers the first run,
    # when the directory does not exist yet
    shutil.rmtree(dir_export, ignore_errors=True)
    os.mkdir(dir_export)

    if not quiet:
        print 'Exporting plain-text definitions to {0}'.format(dir_export)

    keychar = None
    f = None

    pr = progressbar.ProgressBar(len(defs), quiet)

    for head in defs:

        if len(head) < 1:
            continue

        if head[0] != keychar:
            keychar = head[0]

            file_export = os.path.join(dir_export, keychar)

            # close the previous per-letter file before opening the next one
            if f is not None:
                f.close()

            f = open(file_export, 'a')

        f.write('{0}::{1}\n'.format(head.encode('utf8'),
                                    defs[head].encode('utf8')))

        pr.advance()

    if f is not None:
        f.close()
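A toy version of the same per-letter export, using hypothetical data and a temporary directory instead of fs['data']/synonymy/dict-diagnostic; each headword is appended to a file named after its first character, in the head::definition format used above, and the with-blocks close each file automatically.

import codecs
import os
import tempfile

defs = {u'amor': u'love', u'ars': u'art', u'bellum': u'war'}   # hypothetical data
dir_export = tempfile.mkdtemp()

for head in sorted(defs):
    # one file per first letter of the headword, e.g. .../a and .../b
    path = os.path.join(dir_export, head[0])
    with codecs.open(path, 'a', encoding='utf_8') as f:
        f.write(u'{0}::{1}\n'.format(head, defs[head]))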
Example 3
def parse_stop_list(lang, name, quiet):
    '''read frequency table'''

    # open stoplist file

    filename = None

    if name == '*':
        filename = os.path.join(fs['data'], 'common', lang + '.stem.freq')
    else:
        filename = os.path.join(fs['data'], 'v3', lang, name,
                                name + '.freq_stop_stem')

    if not quiet:
        print 'Reading stoplist {0}'.format(filename)

    pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)

    try:
        f = codecs.open(filename, encoding='utf_8')
    except IOError as err:
        print "Can't read {0}: {1}".format(filename, str(err))
        sys.exit(1)

    # read stoplist header to get total token count

    head = f.readline()

    m = re.compile(r'#\s+count:\s+(\d+)', re.U).match(head)

    if m is None:
        print "Can't find header in {0}".format(filename)
        sys.exit(1)

    total = int(m.group(1))

    pr.advance(len(head.encode('utf-8')))

    # read the individual token counts, divide by total

    rank = {}
    n = 1

    for line in f:
        lemma, count = line.split('\t')

        lemma = tesslang.standardize(lang, lemma)
        lemma = pat.number.sub('', lemma)

        rank[lemma] = math.log(n)

        n += 1

        pr.advance(len(line.encode('utf-8')))

    return (rank)
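What the returned table encodes, on made-up data: each lemma maps to the natural log of its position in the frequency list, and the query scripts later use the absolute difference of two ranks as a weighting penalty.

import math

# hypothetical frequency table, most frequent lemma first
freq_table = ['et', 'in', 'est', 'non', 'amor']
rank = {lemma: math.log(n) for n, lemma in enumerate(freq_table, start=1)}

# words that sit far apart in the frequency table are penalized more
print(round(abs(rank['et'] - rank['amor']), 3))    # 1.609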
Example 4
def parse_stem_dict(lang, quiet):
	'''parse the csv stem dictionaries of Helma Dik'''
	
	filename = os.path.join(fs['data'], 'common', lang + '.lexicon.csv')
	
	if not quiet:
		print 'Reading lexicon {0}'.format(filename)
	
	pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)
	
	try: 
		f = codecs.open(filename, encoding='utf_8')
	except IOError as err:
		print "Can't read {0}: {1}".format(filename, str(err))
		sys.exit(1)
		
	pos = dict()
	heads = dict()
	
	for line in f:
		pr.advance(len(line.encode('utf-8')))
		
		line = line.strip().lower().replace('"', '')
		
		try:
			token, code, lemma = line.split(',')
		except ValueError:
			continue
			
		lemma = tesslang.standardize(lang, lemma)
		lemma = pat.number.sub('', lemma)
		
		if len(code) == 10:	
			if lemma in pos:
				pos[lemma].append(code[:2])
			else:
				pos[lemma] = [code[:2]]
				
		heads[lemma] = 1
		
	success = 0
	
	for lemma in heads:
		if lemma in pos:
			success += 1
			
	print 'pos success: {0}%'.format(100 * success / len(heads))
	
	return pos
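A minimal sketch of the per-line parse above, on hypothetical rows (the real file is the <lang>.lexicon.csv layout of token, morphological code, lemma): the first two characters of a ten-character code are collected as the lemma's part of speech.

# hypothetical rows in the token,code,lemma layout read above
rows = [
    'amorem,n-s---mn--,amor',
    'amor,xxx,amor',            # codes that are not 10 characters long add no POS
]

pos = {}
for line in rows:
    token, code, lemma = line.strip().lower().split(',')
    if len(code) == 10:
        pos.setdefault(lemma, []).append(code[:2])

print(pos)    # {'amor': ['n-']}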
Example 5
def build_corpus(defs, quiet=False):
    '''Create a "corpus" of the type expected by Gensim'''

    if not quiet:
        print 'Generating Gensim-style corpus'

    pr = progressbar.ProgressBar(len(defs), quiet)

    corpus = []

    for lemma in defs:
        pr.advance()

        corpus.append(defs[lemma])

    return (corpus)
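For context, this is the shape Gensim expects downstream: a list of token lists, which corpora.Dictionary and doc2bow then turn into (word id, count) pairs. Toy data here; the real corpus comes from the definition bags built above.

from gensim import corpora

# hypothetical token lists, i.e. the shape build_corpus() returns
corpus = [['love', 'like'], ['hate'], ['love', 'war']]

dictionary = corpora.Dictionary(corpus)
bows = [dictionary.doc2bow(doc) for doc in corpus]
print(bows[0])    # e.g. [(0, 1), (1, 1)]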
Example 6
def make_index(defs, quiet=False):
    '''Create two look-up tables: one by id and one by headword'''

    if not quiet:
        print 'Creating indices'

    by_word = {}
    by_id = []

    pr = progressbar.ProgressBar(len(defs), 1)

    for lemma in defs:
        pr.advance()

        by_id.append(lemma)
        by_word[lemma] = len(by_id) - 1

    return (by_word, by_id)
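The two tables are inverses of each other, which the query scripts rely on; a compact equivalent on toy data:

defs = {'amor': ['love'], 'ars': ['art']}    # hypothetical bags of words

by_id = list(defs)                                       # position -> lemma
by_word = {lemma: i for i, lemma in enumerate(by_id)}    # lemma -> position

assert all(by_id[by_word[lemma]] == lemma for lemma in defs)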
Example 7
def make_index(defs, quiet):
	'''Create two look-up tables: one by id and one by headword'''
	
	if not quiet:
		print 'Creating indices'
		
	by_word = {}
	by_id = []
	
	pr = progressbar.ProgressBar(len(defs), 1)
		
	for lemma in defs:
		pr.advance()
		
		by_id.append(lemma)
		by_word[lemma] = len(by_id) - 1
	
	# save the lookup table
	
	file_lookup_word = os.path.join(fs['data'], 'synonymy', 'lookup_word.pickle')
	
	if not quiet:
		print 'Saving index ' + file_lookup_word
	
	f = open(file_lookup_word, "wb")
	pickle.dump(by_word, f)
	f.close()
	
	# save the id lookup
	
	file_lookup_id = os.path.join(fs['data'], 'synonymy', 'lookup_id.pickle')
	
	if not quiet:
		print 'Saving index ' + file_lookup_id
	
	f = open(file_lookup_id, "wb")
	pickle.dump(by_id, f)
	f.close()
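A quick round-trip check of the pickled lookup tables, using a temporary path rather than fs['data']/synonymy; binary mode ('wb'/'rb') is assumed here, since it is the safer choice for pickle files on every platform.

import os
import pickle
import tempfile

by_word = {'amor': 0, 'ars': 1}
path = os.path.join(tempfile.mkdtemp(), 'lookup_word.pickle')

with open(path, 'wb') as f:
    pickle.dump(by_word, f)
with open(path, 'rb') as f:
    assert pickle.load(f) == by_word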
Example 8
def main():
    
    #
    # check for options
    #
    
    parser = argparse.ArgumentParser(
        description='Query the headword similarities matrix')
    parser.add_argument('-q', '--query', metavar='LANG', type=str,
            choices=["greek", "latin"], default="greek",
            help = 'Language to translate from')
    parser.add_argument('-c', '--corpus', metavar='LANG', type=str,
        choices=["greek", "latin"], default="latin",
        help = 'Language to translate to')
    parser.add_argument('-o', '--output', metavar='FILE', type=str,
        default="trans.csv", help = 'Destination file')
    parser.add_argument('-t', '--topics', metavar='N', type=int, default=0,
        help = 'Reduce to N topics using LSI; 0=disabled')
    parser.add_argument('-r', '--results', metavar="N", type=int, default=2,
        help = 'Max number of results to produce for each query')
    parser.add_argument('-w', '--weight', metavar="F", type=float, default=0,
        help = 'Weight scores by inverse log-rank, coefficient F.'
                + ' Suggested range 0-1. Default is no weighting')
    parser.add_argument('--child', metavar="I:N", type=validate_arg_child,
        default = None, help = "This is child I of N, only do part of the work")
    parser.add_argument('--quiet', action='store_const', const=1,
        help = "Don't print status messages to stderr")
    
    opt = parser.parse_args()

    #
    # load data created by read_lexicon.py
    #
        
    # the index by word
    
    by_word = load_dict('lookup_word.json', opt.quiet)
    
    # the index by id
    
    by_id = np.array(load_dict('lookup_id.json', opt.quiet))
        
    # the corpus

    corpus = load_dict("defs_bow.json", opt.quiet)
    
    #
    # use gensim to calculate similarities
    #
    # NOTE: similarities.Similarity behaves differently depending on whether
    # you pass a value for the number of similarities to calculate. Without
    # one, each query returns a numpy array with one element per document,
    # in corpus order, where each element is that document's similarity
    # score. With one, each query returns a list of tuples, each holding a
    # document's position in the corpus and its score, apparently always in
    # order of decreasing score.
    #
    # Older versions of this script expected the list of tuples but did not
    # assume any order and re-sorted it by score. The script now expects the
    # numpy array, which can be manipulated like a vector in R. For example,
    #   sims = sims[filter]
    # subsets the sims array using another array of boolean values, and
    #   sims -= np.absolute((rank[q_id] - rank[filter]) * opt.weight)
    # subtracts from every element of sims the absolute difference between
    # one specific rank, that of document q_id, and each element of rank in
    # turn. The arrays rank and sims, each subset by filter, have the same
    # number of elements.
    
    # create dictionary
    
    if not opt.quiet:
        print 'Creating dictionary'
    
    dictionary = corpora.Dictionary(corpus)
    
    # convert each sample to a bag of words
    
    if not opt.quiet:
        print 'Converting each doc to bag-of-words'
    
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    
    # calculate tf-idf scores
    
    if not opt.quiet:
        print 'Creating tf-idf model'
    
    tfidf = models.TfidfModel(corpus)
    
    if not opt.quiet:
        print 'Transforming the corpus to tf-idf'
    
    corpus_tfidf = tfidf[corpus]
        
    # perform lsi transformation
    
    corpus_final = corpus_tfidf
    
    if opt.topics > 0:
        if not opt.quiet:
            print 'Performing LSI with {0} topics'.format(opt.topics)
    
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=opt.topics)
        
        corpus_final = lsi[corpus_tfidf]
    
    # calculate similarities
    
    if not opt.quiet:
        print 'Calculating similarities (please be patient)'
        
    dir_calc = os.path.join(fs['data'], 'synonymy', 'sims')
    
    index = similarities.Similarity(dir_calc, corpus_final, len(corpus_final))

    # consider frequency distribution
    
    rank = load_ranks(by_id, opt.quiet)

    # determine translation candidates, write output
    
    file_out = codecs.open(opt.output, "w", encoding="utf_8")
    
    if not opt.quiet:
        print 'Writing translation candidates to {0}'.format(opt.output)
    
    # optional filter by language
    
    filter = np.array([r is not None for r in rank])
    if (opt.corpus == "latin"):
        filter = filter & np.invert(np.array([is_greek(lem) for lem in by_id]))
    elif (opt.corpus == "greek"):
        filter = filter & np.array([is_greek(lem) for lem in by_id])
	         
    # take each headword in turn as a query
    
    pr = progressbar.ProgressBar(len(by_word), opt.quiet)
    
    results = []
    
    for q_id, sims in enumerate(index):
        pr.advance()
        
        q = by_id[q_id]
        
        if opt.query == "greek" and not is_greek(q):
            continue
        if opt.query == "latin" and is_greek(q):
            continue
        if rank[q_id] is None:
            continue

        # if child, only do every ith query

        if opt.child is not None:
            child_id, nchildren = opt.child
            
            if q_id % nchildren != child_id % nchildren:
                continue
            
        # exclude the query word itself from the candidate pool
        in_pool = filter[q_id]
        filter[q_id] = False

        # apply filter
        sims = sims[filter]
    
        # apply distribution difference metric
        sims -= np.absolute((rank[q_id] - rank[filter]) * opt.weight)
    
        # add result words and sort by score
        sims = zip(by_id[filter], sims)
        sims = sorted(sims, key=lambda res: res[1], reverse=True)

        # restore the filter entry so later queries still see this word
        filter[q_id] = in_pool

        results = [u"{0}:{1}".format(res, sim) for res, sim in sims[:opt.results]]
        file_out.write(u"{0},".format(q))
        file_out.write(u",".join(results))
        file_out.write(u"\n")
    
    file_out.close()
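The note inside main() describes the two numpy idioms the query loop relies on, boolean-mask subsetting and an element-wise rank penalty; a self-contained toy illustration with made-up numbers:

import numpy as np

# toy stand-ins for one pass through the loop above; document 0 is the query
sims = np.array([0.9, 0.4, 0.7, 0.2])        # similarity of each document to the query
rank = np.array([0.0, 1.1, 1.6, 2.1])        # log-rank of each headword
mask = np.array([False, True, True, True])   # boolean filter; the query itself is excluded
weight = 0.5

sims = sims[mask]                                       # -> [0.4, 0.7, 0.2]
sims -= np.absolute((rank[0] - rank[mask]) * weight)    # -> roughly [-0.15, -0.1, -0.85]
print(sims)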
Example 9
def parse_XML_dictionaries(langs, quiet=False):
    '''Create a dictionary of english translations for each lemma'''

    defs = dict()

    # process latin, greek lexica in turn

    for lang in langs:
        filename = os.path.join(fs['data'], 'common', lang + '.lexicon.xml')

        if not quiet:
            print 'Reading lexicon {0}'.format(filename)

        pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)

        try:
            f = codecs.open(filename, encoding='utf_8')
        except IOError as err:
            print "Can't read {0}: {1}".format(filename, str(err))
            sys.exit(1)

        #
        # Each line in the lexicon is one entry.
        # Process one at a time to extract headword, definition.
        #

        for line in f:
            pr.advance(len(line.encode('utf-8')))

            # skip lines that don't conform with the expected entry structure

            m = pat.entry.search(line)

            if m is None:
                continue

            lemma, entry = m.group(1, 2)

            # standardize the headword

            lemma = pat.clean[lang].sub('', lemma)
            lemma = pat.number.sub('', lemma)
            lemma = tesslang.standardize(lang, lemma)

            # remove elements on the stoplist

            for stop in pat.stop:
                entry = stop.sub('', entry)

            # transliterate betacode to unicode chars
            # in foreign tags

            entry = pat.foreign.sub(mo_beta2uni, entry)

            # extract strings marked as translations of the headword

            def_strings = pat.definition[lang].findall(entry)

            # drop empty defs

            def_strings = [d for d in def_strings if not d.isspace()]

            # skip lemmata for which no translation can be extracted

            if len(def_strings) == 0:
                continue

            if lemma in defs and defs[lemma] is not None:
                defs[lemma].extend(def_strings)
            else:
                defs[lemma] = def_strings

    if not quiet:
        print 'Read {0} entries'.format(len(defs))
        print 'Flattening entries with multiple definitions'

    pr = progressbar.ProgressBar(len(defs), quiet)

    empty_keys = set()

    for lemma in defs:
        pr.advance()

        if defs[lemma] is None or defs[lemma] == []:
            empty_keys.add(lemma)
            continue

        defs[lemma] = '; '.join(defs[lemma])

    if not quiet:
        print 'Lost {0} empty definitions'.format(len(empty_keys))

    for k in empty_keys:
        del defs[k]

    if "" in defs:
        del defs[""]

    return (defs)
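To make the per-line extraction concrete, here is a toy version with a hypothetical entry line and simplified regexes; the real pat.entry, pat.definition, stoplist, and betacode handling are defined elsewhere in the project.

import re

# one hypothetical lexicon line
line = '<entryFree key="amor"><sense><tr>love</tr>, <tr>affection</tr></sense></entryFree>'

entry_pat = re.compile(r'key="([^"]+)">(.*)')
gloss_pat = re.compile(r'<tr>([^<]+)</tr>')

m = entry_pat.search(line)
if m is not None:
    lemma, entry = m.group(1, 2)
    def_strings = [d for d in gloss_pat.findall(entry) if not d.isspace()]
    print('{0}: {1}'.format(lemma, def_strings))    # amor: ['love', 'affection']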
Example 10
def main():

    #
    # check for options
    #

    parser = argparse.ArgumentParser(
        description='Query the headword similarities matrix')
    parser.add_argument('-n',
                        '--results',
                        metavar='N',
                        default=2,
                        type=int,
                        help='Display top N results')
    parser.add_argument(
        '-t',
        '--translate',
        metavar='MODE',
        default=1,
        type=int,
        help='Translation mode: 1=Greek to Latin; 2=Latin to Greek')
    parser.add_argument('-l',
                        '--lsi',
                        action='store_const',
                        const=1,
                        help='Use LSI to reduce dimensionality')
    parser.add_argument('-f',
                        '--feature',
                        metavar="FEAT",
                        default='trans2',
                        type=str,
                        help='Name of feature dictionary to create')
    parser.add_argument('-c',
                        '--cutoff',
                        metavar='C',
                        default=None,
                        type=float,
                        help='Similarity threshold for synonymy (range: 0-1)')
    parser.add_argument('-w',
                        '--weighted',
                        action='store_const',
                        const=1,
                        help='Weight results by rank difference from query')
    parser.add_argument('--scores',
                        action='store_const',
                        const=1,
                        help='Export scores along with translations')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_const',
                        const=1,
                        help="Don't print status messages to stderr")

    opt = parser.parse_args()

    if opt.translate not in [1, 2]:
        opt.translate = 0

    #
    # load data created by read_lexicon.py
    #

    # the text-only defs

    # global full_def
    #
    # full_def = load_dict('full_defs.pickle', opt.quiet)

    # the index by word

    global by_word

    by_word = load_dict('lookup_word.pickle', opt.quiet)

    # the index by id

    global by_id

    by_id = load_dict('lookup_id.pickle', opt.quiet)

    # the corpus

    global corpus

    if opt.lsi is None:
        file_corpus = os.path.join(fs['data'], 'synonymy',
                                   'gensim.corpus_tfidf.mm')
    else:
        file_corpus = os.path.join(fs['data'], 'synonymy',
                                   'gensim.corpus_lsi.mm')

    if not opt.quiet:
        print 'Loading corpus ' + file_corpus

    corpus = corpora.MmCorpus(file_corpus)

    # the similarities index

    global index

    file_index = os.path.join(fs['data'], 'synonymy', 'gensim.index')

    if not opt.quiet:
        print 'Loading similarity index ' + file_index

    index = similarities.Similarity.load(file_index)

    # optional: consider frequency distribution

    global rank

    if opt.weighted == 1:
        rank = dict(parse_stop_list('la', '*', opt.quiet),
                    **parse_stop_list('grc', '*', opt.quiet))

    #
    # determine translation candidates, write output
    #

    if not opt.quiet:
        print 'Exporting dictionary'

    filename_csv = os.path.join(fs['data'], 'synonymy', opt.feature + '.csv')

    file_output = codecs.open(filename_csv, 'w', encoding='utf_8')

    pr = progressbar.ProgressBar(len(by_word), opt.quiet)

    # take each headword in turn as a query

    for q in by_word:
        pr.advance()

        if opt.translate and (is_greek(q) == opt.translate - 1):
            continue

        if (q not in by_word):
            continue

        # query the similarity matrix

        sims = get_results(q)

        # filter out query word, query language

        sims = filter_results(sims, q, opt.translate)

        # optional: apply distribution difference metric

        if opt.weighted == 1:
            sims = apply_freq_diff(sims, q)

        # keep only the best results, top n or above cutoff

        sims = cull(sims, opt.results, opt.cutoff)

        # print row

        export_row(file_output, q, sims, opt.scores)
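get_results, filter_results, apply_freq_diff, and cull are defined elsewhere in this script; as a hedged reading of the "top n or above cutoff" step, a minimal cull could look like the hypothetical helper below, which is not the project's actual implementation.

def cull_sketch(sims, n, cutoff=None):
    '''keep the best n (word, score) pairs, optionally requiring score >= cutoff'''
    sims = sorted(sims, key=lambda pair: pair[1], reverse=True)
    if cutoff is not None:
        sims = [pair for pair in sims if pair[1] >= cutoff]
    return sims[:n]

print(cull_sketch([('amor', 0.9), ('ars', 0.3), ('bellum', 0.6)], 2, cutoff=0.5))
# -> [('amor', 0.9), ('bellum', 0.6)]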