Example #1
def bag_of_words(defs, stem_flag, quiet):
    '''convert dictionary definitions into bags of words'''
    
    # convert to bag of words, count words
    
    if not quiet:
        print("Converting defs to bags of words")
    
    count = {}
    
    pr = ProgressBar(maxval=len(defs))
    
    empty_keys = set()
    
    for lemma in defs:
        pr.update(pr.currval + 1)
                
        defs[lemma] = [tesslang.standardize('any', w) 
            for w in pat.clean['any'].split(defs[lemma]) 
            if not w.isspace() and w != '']
        
        if stem_flag:
            defs[lemma] = [stem(w) for w in defs[lemma]]
        
        if len(defs[lemma]) > 0:
            for d in defs[lemma]:
                if d in count:
                    count[d] += 1
                else:
                    count[d] = 1
        else:
            empty_keys.add(lemma)    
    
    pr.finish()
    
    if not quiet:
        print("Removing hapax legomena")
    
    pr = ProgressBar(maxval=len(defs))
    
    for lemma in defs:
        pr.update(pr.currval + 1)
        
        defs[lemma] = [w for w in defs[lemma] if count[w] > 1]
        
        if defs[lemma] == []:
            empty_keys.add(lemma)
	
    pr.finish()
    
    if not quiet:
        print('Lost {0} empty definitions'.format(len(empty_keys)))
	
    for k in empty_keys:
        del defs[k]
    
    return defs
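
The counting and hapax-filtering passes above can be condensed considerably. Below is a minimal standalone sketch of the same idea using collections.Counter; the defs data is made up for illustration, and the project's helpers (tesslang, pat, stem, ProgressBar) are deliberately left out:

from collections import Counter

# toy input: each lemma maps to its bag of definition words
defs = {
    'amo': ['love', 'like', 'be', 'fond'],
    'odi': ['hate', 'be', 'averse'],
    'rarus': ['scattered'],            # all hapax; dropped below
}

# first pass: count every word across all definitions
count = Counter(w for words in defs.values() for w in words)

# second pass: keep only words seen more than once, then drop
# lemmata whose definitions became empty
defs = {lemma: [w for w in words if count[w] > 1]
        for lemma, words in defs.items()}
defs = {lemma: words for lemma, words in defs.items() if words}

print(defs)   # {'amo': ['be'], 'odi': ['be']}
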
Example #2
def parse_stop_list(lang, name, quiet):
    '''read frequency table'''

    # open stoplist file

    filename = None

    if name == '*':
        filename = os.path.join(fs['data'], 'common', lang + '.stem.freq')
    else:
        filename = os.path.join(fs['data'], 'v3', lang, name,
                                name + '.freq_stop_stem')

    if not quiet:
        print 'Reading stoplist {0}'.format(filename)

    pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)

    try:
        f = codecs.open(filename, encoding='utf_8')
    except IOError as err:
        print "Can't read {0}: {1}".format(filename, str(err))
        sys.exit(1)

    # read stoplist header to get total token count

    head = f.readline()

    m = re.compile(r'#\s+count:\s+(\d+)', re.U).match(head)

    if m is None:
        print "Can't find header in {0}".format(filename)
        sys.exit(1)

    total = int(m.group(1))

    pr.advance(len(head.encode('utf-8')))

    # read tokens in order of frequency; this variant scores each lemma
    # by the log of its rank (the parsed counts and `total` go unused)

    rank = {}
    n = 1

    for line in f:
        lemma, count = line.split('\t')

        lemma = tesslang.standardize(lang, lemma)
        lemma = number.sub('', lemma)

        rank[lemma] = math.log(n)

        n += 1

        pr.advance(len(line.encode('utf-8')))

    return rank
Example #4
def parse_stop_list(lang, name, quiet):
    '''read frequency table'''
    
    # open stoplist file
    
    filename = None
    
    if name == '*':
        filename = os.path.join(basedir, "data", lang + '.stem.freq')
    else:
        filename = os.path.join(basedir, name + '.freq_stop_stem')
    	
    if not quiet:
        print('Reading stoplist {0}'.format(filename))
    	
    pr = ProgressBar(maxval=os.stat(filename).st_size)
    
    try:
        f = open(filename, "r", encoding="utf_8")
    except IOError as err:
        print("Can't read {0}: {1}".format(filename, str(err)))
        sys.exit(1)
    
    # read stoplist header to get total token count
    
    head = f.readline()
    
    m = re.compile(r'#\s+count:\s+(\d+)').match(head)
    
    if m is None:
        print("Can't find header in {0}".format(filename))
        sys.exit(1)
    	
    total = int(m.group(1))
    
    pr.update(pr.currval + len(head.encode('utf-8')))
    
    # read the individual token counts, divide by total
    
    freq = dict()
    
    for line in f:
        
        lemma, count = line.split('\t')
        
        lemma = tesslang.standardize(lang, lemma)
        lemma = pat.number.sub('', lemma)
        
        freq[lemma] = float(count)/total
        
        pr.update(pr.currval + len(line.encode('utf-8')))
        
    pr.finish()
    
    return freq
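
The parse_stop_list variants above score entries differently: Example #4 converts each raw count into a relative frequency, while Example #2 ignores the counts and scores by log rank. Here is a self-contained sketch of both computations, using a made-up stoplist held in io.StringIO in place of the real file:

import io
import math

# made-up stoplist in the expected format: a '# count:' header, then
# tab-separated lemma/count lines, most frequent first
sample = io.StringIO(
    "# count: 100\n"
    "et\t40\n"
    "in\t25\n"
    "qui\t10\n"
)

head = sample.readline()
total = int(head.split(':')[1])          # 100

freq = {}
rank = {}
for n, line in enumerate(sample, start=1):
    lemma, count = line.split('\t')
    freq[lemma] = float(count) / total   # Example #4: relative frequency
    rank[lemma] = math.log(n)            # Example #2: log of the rank

print(freq)   # {'et': 0.4, 'in': 0.25, 'qui': 0.1}
print(rank)   # {'et': 0.0, 'in': 0.693..., 'qui': 1.098...}
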
Example #5
def bag_of_words(defs, stem_flag, quiet=False):
    '''convert dictionary definitions into bags of words'''

    # convert to bag of words, count words

    if not quiet:
        print "Converting defs to bags of words"

    count = {}

    pr = progressbar.ProgressBar(len(defs), quiet)

    empty_keys = set()

    for lemma in defs:
        pr.advance()

        defs[lemma] = [
            tesslang.standardize('any', w)
            for w in pat.clean['any'].split(defs[lemma])
            if not w.isspace() and w != ''
        ]

        if len(defs[lemma]) > 0:
            for d in defs[lemma]:
                if d in count:
                    count[d] += 1
                else:
                    count[d] = 1
        else:
            empty_keys.add(lemma)

    if not quiet:
        print "Removing hapax legomena"

    pr = progressbar.ProgressBar(len(defs), quiet)

    for lemma in defs:
        pr.advance()

        defs[lemma] = [w for w in defs[lemma] if count[w] > 1]

        if defs[lemma] == []:
            empty_keys.add(lemma)

    if not quiet:
        print 'Lost {0} empty definitions'.format(len(empty_keys))

    for k in empty_keys:
        del defs[k]

    return defs
Example #6
def parse_stem_dict(lang, quiet):
	'''parse the csv stem dictionaries of Helma Dik'''
	
	filename = os.path.join(fs['data'], 'common', lang + '.lexicon.csv')
	
	if not quiet:
		print 'Reading lexicon {0}'.format(filename)
	
	pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)
	
	try: 
		f = codecs.open(filename, encoding='utf_8')
	except IOError as err:
		print "Can't read {0}: {1}".format(filename, str(err))
		sys.exit(1)
		
	pos = dict()
	heads = dict()
	
	for line in f:
		pr.advance(len(line.encode('utf-8')))
		
		line = line.strip().lower().replace('"', '')
		
		try:
			token, code, lemma = line.split(',')
		except ValueError:
			continue
			
		lemma = tesslang.standardize(lang, lemma)
		lemma = pat.number.sub('', lemma)
		
		if len(code) == 10:	
			if lemma in pos:
				pos[lemma].append(code[:2])
			else:
				pos[lemma] = [code[:2]]
				
		heads[lemma] = 1
		
	success = 0
	
	for lemma in heads:
		if lemma in pos:
			success += 1
			
	print 'pos success: {0}%'.format(100 * success / len(heads))
	
	return pos
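
The core of parse_stem_dict is its CSV-parsing loop. Below is a standalone sketch of that loop with made-up lexicon rows; the ten-character morphological codes and their two-character part-of-speech prefixes are hypothetical stand-ins for the actual coding scheme:

# made-up rows in the three-column token,code,lemma format
lines = [
    '"amat","v3spia----","amo"',
    '"malformed row"',                  # skipped: wrong field count
    '"amor","n-s---mn--","amor"',
]

pos = {}
for line in lines:
    line = line.strip().lower().replace('"', '')
    try:
        token, code, lemma = line.split(',')
    except ValueError:
        continue
    # ten-character codes carry morphology; the first two characters
    # encode the part of speech
    if len(code) == 10:
        pos.setdefault(lemma, []).append(code[:2])

print(pos)   # {'amo': ['v3'], 'amor': ['n-']}
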
Example #8
def bag_of_words(defs, stem_flag, quiet):
	'''convert dictionary definitions into bags of words'''
	
	# convert to bag of words, count words
	
	if not quiet:
		print "Converting defs to bags of words"
	
	count = {}
	
	pr = progressbar.ProgressBar(len(defs), quiet)
	
	empty_keys = set()
	
	for lemma in defs:
		pr.advance()
		
		defs[lemma] = [tesslang.standardize('any', w) 
							for w in pat.clean['any'].split(defs[lemma]) 
							if not w.isspace() and w != '']
				
		if len(defs[lemma]) > 0:
			for d in defs[lemma]:
				if d in count:
					count[d] += 1
				else:
					count[d] = 1
		else:
			empty_keys.add(lemma)
	
	if not quiet:
		print "Removing hapax legomena"
	
	pr = progressbar.ProgressBar(len(defs), quiet)
	
	for lemma in defs:
		pr.advance()
		
		defs[lemma] = [w for w in defs[lemma] if count[w] > 1]
		
		if defs[lemma] == []:
			empty_keys.add(lemma)
	
	if not quiet:
		print 'Lost {0} empty definitions'.format(len(empty_keys))
		
	for k in empty_keys:
		del defs[k]
	
	return defs
Example #9
def parse_XML_dictionaries(langs, quiet):
    '''Create a dictionary of english translations for each lemma'''
    
    defs = dict()
    
    # process latin, greek lexica in turn
    
    for lang in langs:
        filename = os.path.join(basedir, 'data', lang + '.lexicon.xml')
        
        if not quiet:
            print('Reading lexicon {0}'.format(filename))
        
        pr = ProgressBar(maxval=os.stat(filename).st_size)
        
        try: 
            f = open(filename, "r", encoding="utf_8")
        except IOError as err:
            print("Can't read {0}: {1}".format(filename, str(err)))
            sys.exit(1)
        
        #
        # Each line in the lexicon is one entry.
        # Process one at a time to extract headword, definition.
        #
        
        for line in f:
            pr.update(pr.currval + len(line.encode('utf-8')))
            
            # skip lines that don't conform with the expected entry structure
            
            m = pat.entry.search(line)
            
            if m is None:
                continue
            
            lemma, entry = m.group(1, 2)
            
            # standardize the headword
            
            lemma = pat.clean[lang].sub('', lemma)
            lemma = pat.number.sub('', lemma)
            lemma = tesslang.standardize(lang, lemma)
            
            # remove elements on the stoplist
            
            for stop in pat.stop:
                entry = stop.sub('', entry)
            
            # transliterate betacode to unicode chars
            # in foreign tags
            
            entry = pat.foreign.sub(mo_beta2uni, entry)
            
            # extract strings marked as translations of the headword
            
            def_strings = pat.definition[lang].findall(entry)
            
            # drop empty defs
            
            def_strings = [d for d in def_strings if not d.isspace()]
            
            # skip lemmata for which no translation can be extracted
            
            if not def_strings:
                continue
            
            if lemma in defs and defs[lemma] is not None:					
                defs[lemma].extend(def_strings)
            else:
                defs[lemma] = def_strings
        
        pr.finish()
    
    if not quiet:
        print('Read {0} entries'.format(len(defs)))
        print('Flattening entries with multiple definitions')
    
    pr = ProgressBar(maxval=len(defs))
    
    empty_keys = set()
    
    for lemma in defs:
        pr.update(pr.currval + 1)
        
        if defs[lemma] is None or defs[lemma] == []:
            empty_keys.add(lemma)
            continue
        
        defs[lemma] = '; '.join(defs[lemma])
    
    pr.finish()
    
    if not quiet:
        print('Lost {0} empty definitions'.format(len(empty_keys)))
    
    for k in empty_keys:
        del defs[k]
    
    return defs
Example #10
def parse_XML_dictionaries(langs, quiet=False):
    '''Create a dictionary of english translations for each lemma'''

    defs = dict()

    # process latin, greek lexica in turn

    for lang in langs:
        filename = os.path.join(fs['data'], 'common', lang + '.lexicon.xml')

        if not quiet:
            print 'Reading lexicon {0}'.format(filename)

        pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)

        try:
            f = codecs.open(filename, encoding='utf_8')
        except IOError as err:
            print "Can't read {0}: {1}".format(filename, str(err))
            sys.exit(1)

        #
        # Each line in the lexicon is one entry.
        # Process one at a time to extract headword, definition.
        #

        for line in f:
            pr.advance(len(line.encode('utf-8')))

            # skip lines that don't conform with the expected entry structure

            m = pat.entry.search(line)

            if m is None:
                continue

            lemma, entry = m.group(1, 2)

            # standardize the headword

            lemma = pat.clean[lang].sub('', lemma)
            lemma = pat.number.sub('', lemma)
            lemma = tesslang.standardize(lang, lemma)

            # remove elements on the stoplist

            for stop in pat.stop:
                entry = stop.sub('', entry)

            # transliterate betacode to unicode chars
            # in foreign tags

            entry = pat.foreign.sub(mo_beta2uni, entry)

            # extract strings marked as translations of the headword

            def_strings = pat.definition[lang].findall(entry)

            # drop empty defs

            def_strings = [d for d in def_strings if not d.isspace()]

            # skip lemmata for which no translation can be extracted

            if not def_strings:
                continue

            if lemma in defs and defs[lemma] is not None:
                defs[lemma].extend(def_strings)
            else:
                defs[lemma] = def_strings

    if not quiet:
        print 'Read {0} entries'.format(len(defs))
        print 'Flattening entries with multiple definitions'

    pr = progressbar.ProgressBar(len(defs), quiet)

    empty_keys = set()

    for lemma in defs:
        pr.advance()

        if defs[lemma] is None or defs[lemma] == []:
            empty_keys.add(lemma)
            continue

        defs[lemma] = '; '.join(defs[lemma])

    if not quiet:
        print 'Lost {0} empty definitions'.format(len(empty_keys))

    for k in empty_keys:
        del defs[k]

    if "" in defs:
        del defs[""]

    return defs
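
Examples #9 and #10 share one pipeline: match an entry line, pull out the headword and entry body, collect the translation strings, then flatten multi-sense entries with '; '. Below is a standalone sketch of that pipeline; entry_pat and def_pat are hypothetical stand-ins for the project's pat.entry and pat.definition patterns, and the lexicon lines are made up:

import re

# stand-in patterns; the real ones live in the project's pat module
entry_pat = re.compile(r'<entry key="([^"]+)">(.*)</entry>')
def_pat = re.compile(r'<tr>([^<]*)</tr>')

lexicon = [
    '<entry key="amo"><tr>love</tr><tr>like</tr></entry>',
    '<entry key="amo"><tr>be fond of</tr></entry>',
    'not an entry line',                # skipped: no match
]

defs = {}
for line in lexicon:
    m = entry_pat.search(line)
    if m is None:
        continue
    lemma, entry = m.group(1, 2)
    def_strings = [d for d in def_pat.findall(entry) if not d.isspace()]
    if not def_strings:
        continue
    defs.setdefault(lemma, []).extend(def_strings)

# flatten entries with multiple definitions, as both examples do
defs = {k: '; '.join(v) for k, v in defs.items()}
print(defs)   # {'amo': 'love; like; be fond of'}
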