SEP2 = "," SEP4 = "#" SEP3 = ":" STRIP_DST_SENSES = True BABELNET_PKL = "babelnet.pkl" # Meta-parameters of the method REMOVE_BOW_STOPWORDS = True LEMMATIZE_BOW = True LOWERCASE_BOW = True _re_norm_babel = re.compile(ur"[()_:]", re.U|re.I) _re_norm_babel_dash = re.compile(ur"[()_:-]", re.U|re.I) _re_whitespaces2 = re.compile(r"\s+") _stoplist = get_stoplist() def good_token(w): return (w not in _stoplist and not re_spaced_numbers.match(w)) class DailyLimitException(Exception): def __init__(self): pass class BabelNet(object): def __init__(self, babelnet_keys, babelnet_fpath="", freq_fpath="", normalized=True, divide_by_freq=False, force_api=False): self._babelnet_keys = babelnet_keys
import os
from os.path import splitext, join
from subprocess import Popen, PIPE

from jnt.common import exists
from jnt.morph import get_stoplist
from jnt.patterns import re_number

ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

# load_voc is assumed to be importable from the project (its import is outside this excerpt)
_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()


def filter_voc(text):
    """ Keeps only tokens present in the AdaGram vocabulary (lowercased on output). """
    text_adagram = [w.lower() for w in text.split(" ") if w in _adagram_voc]
    return " ".join(text_adagram)


TARGET_BEG = "((("
TARGET_END = ")))"


def filter_context(context, target, remove_target, context_size):
    # drop empty tokens, stopwords, and numbers from the context
    context = [
        w for w in context.split(" ")
        if w.strip() != "" and w not in _stoplist and not re_number.match(w)
def generete_words(voc_fpath, freq_fpath, sc):
    output_all_fpath = join(dirname(voc_fpath), splitext(basename(voc_fpath))[0] + "-clusters.csv")

    # load resources
    freq = load_freq(freq_fpath, min_freq=10, preprocess=True, sep='\t',
                     lowercase=False, strip_pos=False, use_pickle=True)
    voc = [r.word for i, r in read_csv(voc_fpath, "\t", encoding='utf8',
                                       error_bad_lines=False).iterrows()]
    res = defaultdict(Counter)
    stoplist = get_stoplist()

    # for each model retrieve related words
    for sname in sc:
        output_fpath = join(dirname(voc_fpath),
                            splitext(splitext(basename(sname))[0])[0] + "-clusters.csv")

        with codecs.open(output_fpath, "w", "utf-8") as output:
            print >> output, "word\tnum_related\trelated"

            for w in voc:
                related_cluster = Counter()

                # calculate candidate sense words
                candidates = set()
                for sw in sc[sname].find_word(w):
                    pos = "" if len(sw.split("#")) < 2 else sw.split("#")[1].lower()
                    if pos in ["np", "nn", ""]:
                        candidates.add(sw)

                print "\n\n\n======================== " + w.upper()

                # for each word candidate
                for wc in candidates:
                    if "#" in wc and freq.get(wc, 0) < MIN_SENSE_FREQ:
                        print "\n\n>>>Skipping:", w, wc, freq.get(wc, 0)
                        continue

                    # for each sense of the candidate
                    print "\n\n>>>", w, wc, freq.get(wc, 0)
                    for sense_id in sorted(sc[sname].data[wc]):
                        # build list of top related words
                        related_words = {}
                        for rw in sorted(sc[sname].data[wc][sense_id]["cluster"],
                                         key=sc[sname].data[wc][sense_id]["cluster"].get,
                                         reverse=True):
                            if len(related_words) < RELATED_PER_SENSE and re_latin_word.match(rw):
                                rw_lemma = rw.split("#")[0].lower()
                                if rw_lemma not in related_words and rw_lemma not in stoplist:
                                    related_words[rw_lemma] = sc[sname].data[wc][sense_id]["cluster"][rw]

                        related_cluster.update(related_words)
                        #print "+++++++", related_cluster

                        print sense_id,
                        for x in related_words:
                            print x,
                        print ""

                res[w].update(related_cluster)
                related_cluster_s = sorted(related_cluster, key=related_cluster.get, reverse=True)
                print >> output, "%s\t%d\t%s" % (w, len(related_cluster), ','.join(related_cluster_s))
                print ":::%s\t%d\t%s" % (w, len(related_cluster), ','.join(related_cluster_s))

        print "\n\nOutput:", output_fpath

    # Save union of related words for all input datasets
    # related_cluster_words = take(RELATED_PER_WORD, sorted(related_cluster, key=related_cluster.get, reverse=True)
    with codecs.open(output_all_fpath, "w", "utf-8") as output:
        print >> output, "word\tnum_related\trelated"
        for w in sorted(res, key=res.get, reverse=True):
            print >> output, "%s\t%d\t%s" % (w, len(res[w]), ','.join(res[w]))

    print "\n\nOutput:", output_all_fpath
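# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module). It assumes
# that `sc` maps a model name to a sense inventory object exposing
# find_word(word) and data[word][sense_id]["cluster"] (a Counter of related
# words), which is how generete_words accesses it above. The loader name and
# the file paths below are hypothetical placeholders.
#
#   sc = {"senses-wiki.csv.gz": load_sense_inventory("senses-wiki.csv.gz")}
#   generete_words("voc.csv", "freq.csv", sc)
# ---------------------------------------------------------------------------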