import re

from jnt.morph import get_stoplist
# NOTE: re_spaced_numbers is assumed to live in jnt.patterns, next to re_number
from jnt.patterns import re_spaced_numbers

SEP2 = ","
SEP4 = "#"
SEP3 = ":"
STRIP_DST_SENSES = True
BABELNET_PKL = "babelnet.pkl"

# Meta-parameters of the method
REMOVE_BOW_STOPWORDS = True
LEMMATIZE_BOW = True
LOWERCASE_BOW = True


_re_norm_babel = re.compile(ur"[()_:]", re.U|re.I)
_re_norm_babel_dash = re.compile(ur"[()_:-]", re.U|re.I)
_re_whitespaces2 = re.compile(r"\s+")
_stoplist = get_stoplist()


def good_token(w):
    return (w not in _stoplist and
            not re_spaced_numbers.match(w))
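
A minimal usage sketch for good_token (the tokens and the outcome are illustrative; the actual result depends on the stoplist returned by get_stoplist and on the re_spaced_numbers pattern):

# Hypothetical example: keep only content-bearing tokens of a context
tokens = ["the", "jaguar", "12 000", "car"]
content_tokens = [w for w in tokens if good_token(w)]
# expected to drop "the" (stopword) and "12 000" (spaced number),
# leaving something like ["jaguar", "car"]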


class DailyLimitException(Exception):
    """Raised when the daily BabelNet API request limit is exhausted."""


class BabelNet(object):
    def __init__(self, babelnet_keys, babelnet_fpath="", freq_fpath="", normalized=True, divide_by_freq=False, force_api=False):
        self._babelnet_keys = babelnet_keys
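
A hypothetical instantiation based only on the constructor signature shown above; the API key is a placeholder and the remaining arguments keep their defaults:

# Illustrative call, not taken from the original module:
bn = BabelNet(babelnet_keys=["YOUR_BABELNET_KEY"], babelnet_fpath=BABELNET_PKL)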
Example #2
import os
from os.path import splitext, join
from subprocess import Popen, PIPE

from jnt.common import exists
from jnt.morph import get_stoplist
from jnt.patterns import re_number
# NOTE: load_voc is used below but its import is not shown in this snippet

ADAGRAM_VOC = "/Users/alex/tmp/adagram/HugeModel-voc.csv"
DEFAULT_MAPPING = "/Users/alex/work/joint/src/data/best-matching-out.csv"
DYLD_LIBRARY = "/Users/alex/tmp/adagram/AdaGram.jl/lib/"
ADAGRAM_SCRIPTS_DIR = "/Users/alex/work/joint/src/jnt/adagram/"

_adagram_voc = load_voc(ADAGRAM_VOC, silent=True)
_stoplist = get_stoplist()


def filter_voc(text):
    text_adagram = [w.lower() for w in text.split(" ") if w in _adagram_voc]
    return " ".join(text_adagram)


TARGET_BEG = "((("
TARGET_END = ")))"


def filter_context(context, target, remove_target, context_size):
    context = [
        w for w in context.split(" ")
        if w.strip() != "" and w not in _stoplist and not re_number.match(w)
Example #3
import codecs
from collections import Counter, defaultdict
from os.path import join, dirname, basename, splitext

# read_csv is assumed to be pandas.read_csv
from pandas import read_csv

from jnt.morph import get_stoplist
# NOTE: load_freq, re_latin_word, MIN_SENSE_FREQ and RELATED_PER_SENSE are
# not defined in this snippet


def generete_words(voc_fpath, freq_fpath, sc):
    output_all_fpath = join(dirname(voc_fpath), splitext(basename(voc_fpath))[0] + "-clusters.csv")

    freq = load_freq(freq_fpath, min_freq=10, preprocess=True, sep='\t',
              lowercase=False, strip_pos=False, use_pickle=True)

    # load resources
    voc = [r.word for i,r in read_csv(voc_fpath, "\t", encoding='utf8', error_bad_lines=False).iterrows()]
    res = defaultdict(Counter)
    stoplist = get_stoplist()

    # for each model retrieve related words
    for sname in sc:
        output_fpath = join(dirname(voc_fpath), splitext(splitext(basename(sname))[0])[0] + "-clusters.csv")
        with codecs.open(output_fpath, "w", "utf-8") as output:
            print >> output, "word\tnum_related\trelated"
            for w in voc:
                related_cluster = Counter()

                # calculate candidate sense words
                candidates = set()
                for sw in sc[sname].find_word(w):
                    pos = "" if len(sw.split("#")) < 2 else sw.split("#")[1].lower()
                    if pos in ["np","nn",""]: candidates.add(sw)

                print "\n\n\n======================== " + w.upper()

                # for each word candidate
                for wc in candidates:
                    if "#" in wc and freq.get(wc, 0) < MIN_SENSE_FREQ:
                        print "\n\n>>>Skipping:", w, wc, freq.get(wc, 0)
                        continue

                    # for each sense of the candidate
                    print "\n\n>>>", w, wc, freq.get(wc, 0)
                    for sense_id in sorted(sc[sname].data[wc]):

                        # build list of top related words
                        related_words = {}
                        cluster = sc[sname].data[wc][sense_id]["cluster"]
                        for rw in sorted(cluster, key=cluster.get, reverse=True):
                            if len(related_words) < RELATED_PER_SENSE and re_latin_word.match(rw):
                                rw_lemma = rw.split("#")[0].lower()
                                if rw_lemma not in related_words and rw_lemma not in stoplist:
                                    related_words[rw_lemma] = cluster[rw]

                        related_cluster.update(related_words)
                        #print "+++++++", related_cluster
                        print sense_id,
                        for x in related_words: print x,
                        print ""

                res[w].update(related_cluster)
                related_cluster_s = sorted(related_cluster, key=related_cluster.get, reverse=True)
                print >> output, "%s\t%d\t%s" % (w, len(related_cluster), ','.join(related_cluster_s))
                print ":::%s\t%d\t%s" % (w, len(related_cluster), ','.join(related_cluster_s))


        print "\n\nOutput:", output_fpath

    # Save union of related words for all input datasets
    # related_cluster_words = take(RELATED_PER_WORD, sorted(related_cluster, key=related_cluster.get, reverse=True)
    with codecs.open(output_all_fpath, "w", "utf-8") as output:
        print >> output, "word\tnum_related\trelated"
        for w in sorted(res, key=res.get, reverse=True):
            print >> output, "%s\t%d\t%s" % (w, len(res[w]), ','.join(res[w]))


    print "\n\nOutput:", output_all_fpath