Example #1
 def __init__(self, cfg):
     self.cfg = cfg
     self.out_fn = self.cfg.get("machine", "ext_definitions")
     ensure_dir(os.path.dirname(self.out_fn))
     dep_map_fn = cfg.get("deps", "dep_map")
     self.read_dep_map(dep_map_fn)
     self.lemmatizer = Lemmatizer(cfg)
Example #2
def prepare_articles(articles, from_cache=False):
    texts = []
    lemmatizer = Lemmatizer()
    german_stop_words = stopwords.words('german')
    filename = "data/lda-trainingdata.pickle"
    if from_cache:
        with open(filename, 'rb') as file:
            texts = pickle.load(file)
            return texts
    else:
        # Remove '... [+ xxx chars]' pattern from 'content'
        for article in progressbar(articles):
            article_text = ""
            for text in [article.description, article.title, article.fulltext if article.fulltext else article.content]:
                if text:
                    text = re.sub(r'\[.*?\]', '', text)
                    text = " ".join([x for x in text.split() if x.isalnum() or '.' in x])
                    article_text += lemmatizer.lemmatize_text(text=text, verbose=False)

            article_text = [x for x in article_text.split() if x not in german_stop_words]
            texts.append(article_text)

        # Cache lda-trainingdata
        if not os.path.exists("data"):
            os.makedirs("data")
        with open(filename, 'wb') as file:
            pickle.dump(texts, file)

    return texts
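A minimal sketch of how the prepare_articles helper above might be driven, assuming article objects that expose the attributes it reads (description, title, fulltext, content); the Article container and the sample data below are hypothetical:

from collections import namedtuple

Article = namedtuple('Article', ['description', 'title', 'fulltext', 'content'])

articles = [
    Article(description='Tomaten werden teurer.',
            title='Gemuesepreise steigen',
            fulltext=None,
            content='Die Preise fuer Tomaten sind stark gestiegen. [+ 123 chars]'),
]

# first call builds and caches data/lda-trainingdata.pickle ...
texts = prepare_articles(articles)
# ... later calls can reuse the cached training data
texts = prepare_articles(articles, from_cache=True)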
Example #3
    def __init__(self, cfg):
        try:
            self.batch = cfg.getboolean('similarity_machine', 'batch')
        except NoSectionError:
            self.batch = False

        self.cfg = cfg
        self.lemmatizer = Lemmatizer(cfg)
        self.machine_wrapper = MachineWrapper(cfg)
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
Example #4
 def __init__(self, cfg):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     self.out_fn = self.cfg.get("machine", "definitions_binary_out")
     ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.read_dep_map(dep_map_fn)
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.word2lemma = {}
Example #5
    def add_string(self, string, encoded_from=None, train=None): 
        """Add string to mapping and return id and optionally character id.
        Arguments:
            string: string.
            encoded_from: string, only used if is_encoded == True or
                is_encoded == None (undecided).
            train: Train mapping. If given, the words and alphabets are reused
                from the train mapping.
        Returns:
            If characters are allowed, a tuple (string id, character id).
            Otherwise only string id.
        """

        # Store strings when is_encoded == None
        if self.is_encoded is None:
            self.strings_original.add(string)

        # Encode string with lemma rule
        if self.is_encoded is None or self.is_encoded is True:
            # Do not encode special labels
            if string not in ["<pad>", "<unk>", "<none>", "<root>", "<anchor>"]:
                encoded_string = Lemmatizer.gen_absolute_lemma_rule(encoded_from, string)
                if encoded_string in self.strings_map:
                    string = encoded_string
                else:
                    string = Lemmatizer.gen_lemma_rule(encoded_from, string)

        # Word-level information
        if string not in self.strings_map:
            if train:
                string = '<unk>'
            else:
                self.strings_map[string] = len(self.strings)
                self.strings.append(string)
        
        if self._include_characters:
            # Character-level information
            if string not in self.charseqs_map:
                self.charseqs_map[string] = len(self.charseqs)
                self.charseqs.append([])
                for c in string:
                    if c not in self.alphabet_map:
                        if train:
                            c = '<unk>'
                        else:
                            self.alphabet_map[c] = len(self.alphabet)
                            self.alphabet.append(c)
                    self.charseqs[-1].append(self.alphabet_map[c])

        return (self.strings_map[string], self.charseqs_map[string]) if self._include_characters else self.strings_map[string]
Example #6
 def __init__(self, cfg, direct_parse=False):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     if (not direct_parse):
         self.out_fn = self.cfg.get("machine", "definitions_binary_out")
         ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.read_dep_map(dep_map_fn)
     self.word2lemma = {}
     self.first_only = cfg.getboolean('filter', 'first_only')
Example #8
 def parse_batch(article: WikiArticle,
                 lemmatizer: Lemmatizer = self.l) -> List:  # noqa
     data = []
     for idx, (lemmas, tokens, sentence) in enumerate(
             lemmatizer.lemmatize(article.text)):
         data.append("{}\t{}\t{}\t{}\n".format(article.title_id, idx,
                                               sentence,
                                               " ".join(lemmas)))
     return data
Example #9
 def preprocessing(self, text, lang):
     '''
     tokenize the text into several words and sentences
     '''
     self.stop_words = stopwords.words(lang) + list(punctuation)
     if lang == 'indonesian':
         self.lmm = Lemmatizer()
     elif lang == 'english':
         self.lmm = WordNetLemmatizer()
     self.tokenized_sent = list(set(sent_tokenize(text)))
Example #10
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')

        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section)
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))
Example #11
    def from_absolute_encodings(mapping, is_encoded):
        assert mapping.is_encoded is None or mapping.is_encoded == True
        assert is_encoded == True

        new = Mapping(include_characters=mapping._include_characters, is_encoded=True, train=None)
        for key in mapping.strings_map:
            if not key in ["<pad>", "<unk>", "<none>", "<root>", "<anchor>"]:
                if Lemmatizer.is_absolute_lemma_rule(key):
                    new.strings_map[key] = len(new.strings)
                    new.strings.append(key)
        return new
Example #12
    def parse_sentence(self, s):
        keywords = []

        # Lemmatize sentence and only keep verbs, nouns, dates and PTs
        l = Lemmatizer()
        lemmas = l.lemmatize(s)
        lemmas = l.filter(lemmas, ['V', 'N', 'W', 'PT'])

        # Normalize lemmas
        for l in lemmas:
            if l['tag'] == 'W':
                norm_lemma = l['lemma']
            else:
                norm_lemma = self.normalize(l['lemma'])

            if len(norm_lemma) > 0 and norm_lemma not in ignore_lemmas:
                keywords.append(norm_lemma)

        self.vprint("Keywords: ", keywords)

        return [self.crawler.getwordid(word) for word in keywords]
Example #15
    def id_to_string(self, string_id, encoded_from=None, train=None):
        """Returns decoded string from int id."""

        string = self.strings[string_id]

        # Special strings are not encoded
        if string in ["<pad>", "<unk>", "<none>", "<root>", "<anchor>"]:
            return string
        else:
            # Decode string with lemma rule
            if self.is_encoded is None or self.is_encoded is True:
                return Lemmatizer.apply_lemma_rule(encoded_from, string)
            else:
                return string
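Taken together, Examples #5 and #15 suggest a lemma-rule round trip: add_string stores a lemma encoded against its form via Lemmatizer.gen_lemma_rule, and id_to_string decodes it again via Lemmatizer.apply_lemma_rule. A minimal sketch, assuming a Mapping constructed as in Example #11 and that the two rule helpers are exact inverses (neither is shown in these excerpts):

# hypothetical round trip over a single form/lemma pair
mapping = Mapping(include_characters=False, is_encoded=True, train=None)

form, lemma = "running", "run"
lemma_id = mapping.add_string(lemma, encoded_from=form)

# holds if apply_lemma_rule(form, gen_lemma_rule(form, lemma)) == lemma
assert mapping.id_to_string(lemma_id, encoded_from=form) == lemma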
Example #16
 def __init__(self, cfg, direct_parse=False):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     if not direct_parse:
         self.out_fn = self.cfg.get("machine", "definitions_binary_out")
         ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.read_dep_map(dep_map_fn)
     self.word2lemma = {}
     self.first_n = cfg.getint('filter', 'first_n')
     self.graph_dir = self.cfg.get('machine', 'graph_dir')
     ensure_dir(self.graph_dir)
Example #17
    def __init__(self, cfg, cfg_section="word_sim"):
        try:
            self.batch = cfg.getboolean(cfg_section, "batch")
        except NoSectionError:
            self.batch = False

        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words("english"))
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))
Example #18
    def process(filename):
        global lemmatizer
        lemmatizer = Lemmatizer()

        raw_tweets = Tokenizer.parse(filename)

        raw_tweets, hashtags = Tokenizer.extract_hashtags(raw_tweets)

        raw_tweets, mentions = Tokenizer.extract_mentions(raw_tweets)

        raw_tweets, emojis = Tokenizer.extract_emojis(raw_tweets)

        tweets = []

        for text, hashtag, mention, emoji in zip(raw_tweets, hashtags, mentions, emojis):
            tweets.append(Tweet(Tokenizer.tokenize(
                text), hashtag, mention, emoji))

        return tweets
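A hedged usage sketch for Example #18, assuming process is exposed as a static method of Tokenizer (the decorator is not shown in the excerpt) and that the input path, a placeholder here, points to a file in whatever format Tokenizer.parse expects:

tweets = Tokenizer.process('data/tweets.txt')  # hypothetical input file
for tweet in tweets[:3]:
    print(tweet)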
Example #19
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')

        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section, self.lexicon)
        self.expand = cfg.getboolean(cfg_section, "expand")
        compositional = cfg.getboolean('similarity', 'compositional')
        if compositional is True:
            self.text_to_4lang = TextTo4lang(cfg, direct_parse=True)
        logging.info("expand is {0}".format(self.expand))
        self.allow_4lang = cfg.getboolean('machine', 'allow_4lang')
Example #20
def test_usage():

    with timing('Loading dictionary entries'):
        dict = load_dict('data/dict/polimorf-20190818.tab', limit=5000)

    with timing('Loading word vectors'):
        word_vectors = KeyedVectors.load_word2vec_format(
            'data/nkjp+wiki-forms-all-300-skipg-ns.txt', limit=5000)

    with timing('Initializing POS tagger'):
        posTagger = Lemmatizer.create(dict, word_vectors)
        posTagger.load_model('data/disambiguation.h5')

    text = '5 kilogramów pomidorów trafiło do kuchnii. Zostały ugotowane na miękko.'
    chunks = tokenize(text)
    assert chunks[0].tokens[1].orth == 'kilogramów'

    print(chunks)

    posTagger.tag(chunks)
    assert chunks[0].tokens[1].disamb_lemma == 'kilogram'
    assert chunks[0].tokens[1].disamb_tag.startswith('noun')

    print(chunks)
Example #21
class DepTo4lang():

    dep_regex = re.compile("([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
        self.dependency_processor = DependencyProcessor(self.cfg)
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.undefined = set()
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.word2lemma = {}

    def read_dep_map(self, dep_map_fn):
        self.dependencies = defaultdict(list)
        for line in file(dep_map_fn):
            l = line.strip()
            if not l or l.startswith('#'):
                continue
            dep = Dependency.create_from_line(l)
            self.dependencies[dep.name].append(dep)

    def apply_dep(self, dep, machine1, machine2):
        dep_type = dep['type']
        msd1 = dep['gov'].get('msd')
        msd2 = dep['dep'].get('msd')
        if dep_type not in self.dependencies:
            if dep_type not in self.undefined:
                self.undefined.add(dep_type)
                logging.warning(
                    'skipping dependency not in dep_to_4lang map: {0}'.format(
                        dep_type))
            return False  # not that anyone cares
        for dep in self.dependencies[dep_type]:
            dep.apply(msd1, msd2, machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    #  TODO these are words that only have pointers to an MWE
                    #  that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    #  TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue

                # logging.info('adding: {0}'.format(word))
                # logging.info('ext_lex_keys: {0}'.format(
                    # self.lexicon.ext_lexicon.keys()))
                self.lexicon.add(word, machine)
            except Exception:
                logging.error(u"exception caused by: '{0}'".format(word))
                # logging.error(
                #     u'skipping "{0}" because of an exception:'.format(
                #         word))
                # logging.info("entry: {0}".format(entry))
                traceback.print_exc()
                sys.exit(-1)
                continue

        logging.info('added {0}, done!'.format(c + 1))

    def print_graphs(self):
        print_4lang_graphs(
            self.lexicon.ext_lexicon,
            self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        self.lexicon.save_to_binary(self.out_fn)

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_root_lemmas(self, deps):
        return [
            d['dep'].setdefault(
                'lemma', self.lemmatizer.lemmatize(d['dep']['word']))
            for d in deps if d['type'] == 'root']  # TODO

    def get_dep_definition(self, word, deps):
        deps = self.dependency_processor.process_dependencies(deps)
        root_lemmas = self.get_root_lemmas(deps)
        if not root_lemmas:
            logging.warning(
                u'no root dependency, skipping word "{0}"'.format(word))
            return None

        word2machine = self.get_machines_from_deps_and_corefs(
            [deps], [], process_deps=False)

        root_machines = filter(None, map(word2machine.get, root_lemmas))
        if not root_machines:
            logging.info("failed to find root machine")
            logging.info('root lemmas: {0}'.format(root_lemmas))
            logging.info('word2machine: {0}'.format(word2machine))
            sys.exit(-1)

        word_machine = self.lexicon.get_new_machine(word)

        for root_machine in root_machines:
            word_machine.unify(root_machine)
            word_machine.append(root_machine, 0)
        return word_machine

    def get_machines_from_deps_and_corefs(
            self, dep_lists, corefs, process_deps=True):
        if process_deps:
            dep_lists = map(
                self.dependency_processor.process_dependencies, dep_lists)
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no-1] = word

        # logging.info('coref index: {0}'.format(coref_index))

        word2machine = {}
        for deps in dep_lists:
            for dep in deps:
                for t in (dep['gov'], dep['dep']):
                    self.word2lemma[t['word']] = t.setdefault(
                        'lemma', self.lemmatizer.lemmatize(t['word']))

        for i, deps in enumerate(dep_lists):
            try:
                for dep in deps:
                    word1 = dep['gov']['word']
                    word2 = dep['dep']['word']
                    # logging.info('dep: {0}, w1: {1}, w2: {2}'.format(
                    #     repr(dep), repr(word1), repr(word2)))
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)

                    """
                    if c_word1 != word1:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word1, c_word1))
                    if c_word2 != word2:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word2, c_word2))
                    """
                    lemma1 = self.word2lemma[c_word1]
                    lemma2 = self.word2lemma[c_word2]

                    # TODO
                    # lemma1 = lemma1.replace('/', '_PER_')
                    # lemma2 = lemma2.replace('/', '_PER_')

                    # logging.info(
                    #     'lemma1: {0}, lemma2: {1}'.format(
                    #         repr(lemma1), repr(lemma2)))

                    for lemma in (lemma1, lemma2):
                        if lemma not in word2machine:
                            word2machine[lemma] = self.lexicon.get_new_machine(
                                lemma)

                    self.apply_dep(
                        dep, word2machine[lemma1], word2machine[lemma2])

            except:
                logging.error(u"failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")

        return word2machine
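A minimal, hypothetical driver for Example #21's DepTo4lang, written in the same Python 2 style as the class itself (it relies on file() and dict.iteritems()); the config path is a placeholder and must define the "deps", "machine" and "dict" options read above:

from ConfigParser import ConfigParser

cfg = ConfigParser()
cfg.read('conf/dep_to_4lang.cfg')  # placeholder path

converter = DepTo4lang(cfg)
converter.dep_to_4lang()   # build machines from the parsed dictionary definitions
converter.save_machines()  # write the lexicon to machine/definitions_binary_out
converter.print_graphs()   # optionally dump 4lang graphs to machine/graph_dir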
Example #22
class WordSimilarity():
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')

        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section, self.lexicon)
        self.expand = cfg.getboolean(cfg_section, "expand")
        compositional = cfg.getboolean('similarity', 'compositional')
        if compositional is True:
            self.text_to_4lang = TextTo4lang(cfg, direct_parse=True)
        logging.info("expand is {0}".format(self.expand))
        self.allow_4lang = cfg.getboolean('machine', 'allow_4lang')

    def log(self, string):
        if not self.batch:
            logging.info(string)

    def sim_type_to_function(self, sim_type):
        return lambda w1, w2: self.word_similarities(w1, w2)[sim_type]

    def machine_similarities(self, machine1, machine2, machine1_expand, machine2_expand):
        pn1, pn2 = machine1.printname(), machine2.printname()
        self.log(u'machine1: {0}, machine2: {1}'.format(pn1, pn2))

        links1, nodes1 = self.get_links_nodes(machine1)
        links2, nodes2 = self.get_links_nodes(machine2)
        links1_expand, nodes1_expand = self.get_links_nodes(machine1_expand)
        links2_expand, nodes2_expand = self.get_links_nodes(machine2_expand)

        self.log('links1: {0}, links2: {1}'.format(links1, links2))
        self.log('nodes1: {0}, nodes2: {1}'.format(nodes1, nodes2))
        self.log('links1_expand: {0}, links2_expand: {1}'.format(links1_expand, links2_expand))
        self.log('nodes1_expand: {0}, nodes2_expand: {1}'.format(nodes1_expand, nodes2_expand))

        sims = self.sim_feats.get_all_features(
            MachineInfo(machine1, nodes1, nodes1_expand, links1, links1_expand),
            MachineInfo(machine2, nodes2, nodes2_expand, links2, links2_expand))

        # TODO: we should use this way, but so far it didn't prove to be better
        # if sims['is_antonym'] == 1:
        #     sims['shortest_path'] = 0

        return sims

    def lemma_similarities(self, lemma1, lemma2):
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]

        if lemma1 == lemma2:
            lemma_sims = self.sim_feats.one_similarities()

        machine1, machine2 = map(
            lambda l: self.lexicon.get_machine(l, allow_4lang=self.allow_4lang), (lemma1, lemma2))
        machine1_expand, machine2_expand = map(
            self.lexicon.get_expanded_definition, (lemma1, lemma2))

        if not self.batch:
            for w, m in ((lemma1, machine1), (lemma2, machine2)):
                print_4lang_graph(w, m, self.graph_dir)
            for w, m in ((lemma1, machine1_expand), (lemma2, machine2_expand)):
                print_4lang_graph(w, m, self.graph_dir + "_expand")

        lemma_sims = self.machine_similarities(machine1, machine2, machine1_expand, machine2_expand)

        self.lemma_sim_cache[(lemma1, lemma2)] = lemma_sims
        self.lemma_sim_cache[(lemma2, lemma1)] = lemma_sims
        return lemma_sims

    def word_similarities(self, word1, word2):
        if (word1, word2) in self.word_sim_cache:
            return self.word_sim_cache[(word1, word2)]
        lemma1, lemma2 = [self.lemmatizer.lemmatize(
            word, defined=self.defined_words, stem_first=True, uppercase=True)
                          for word in (word1, word2)]
        # self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
        if lemma1 is None or lemma2 is None:
            if lemma1 is None:
                logging.debug("OOV: {0}".format(word1))
            if lemma2 is None:
                logging.debug("OOV: {0}".format(word2))

            word_sims = self.sim_feats.zero_similarities()
        else:
            word_sims = self.lemma_similarities(lemma1, lemma2)
        self.word_sim_cache[(word1, word2)] = word_sims
        self.word_sim_cache[(word2, word1)] = word_sims
        return word_sims

    def phrase_similarities(self, phrase1, phrase2):
        words1 = phrase1.split(' ')
        words2 = phrase2.split(' ')
        if (len(words1) == 1 and len(words2) == 1):
            return self.word_similarities(phrase1, phrase2)
        else:
            # TODO: cache!
            machine1 = self.text_to_4lang.process_phrase(phrase1)
            machine2 = self.text_to_4lang.process_phrase(phrase2)
            nodes1 = self.get_nodes_from_text_machine(machine1)
            nodes2 = self.get_nodes_from_text_machine(machine2)
            sims = self.sim_feats.get_all_features(
                MachineInfo(machine1, nodes1, nodes1, None, None, has_printname=False),
                MachineInfo(machine2, nodes2, nodes2, None, None, has_printname=False))
            print "{0}\t{1}\t{2}".format(phrase1, phrase2, sims)
            return sims

    def get_nodes_from_text_machine(self, machine, excludes=["ROOT"]):
        return set(
            itertools.chain(*[self._get_all_nodes(k) for k in machine.values() if k.printname() not in set(excludes)]))
        # return [k for k in set(machine.keys()).difference(set(excludes))]

    def _get_all_nodes(self, machine):
        nodes = [m for m in MachineTraverser.get_nodes(machine, names_only=True, keep_upper=False)]
        return nodes

    def get_links_nodes(self, machine, use_cache=True):
        if use_cache and machine in self.links_nodes_cache:
            return self.links_nodes_cache[machine]
        self.seen_for_links = set()
        links, nodes = self._get_links_and_nodes(machine, depth=0)
        links, nodes = set(links), set(nodes)
        links.add(machine.printname())
        nodes.add(machine.printname())
        self.links_nodes_cache[machine] = (links, nodes)
        return links, nodes

    def _get_links_and_nodes(self, machine, depth, exclude_links=False):
        name = machine.printname()
        if name.isupper() or name == '=AGT':
            links, nodes = [], []
        elif exclude_links:
            links, nodes = [], [name]
        else:
            links, nodes = [name], [name]

        # logging.info("{0}{1},{2}".format(depth*"    ", links, nodes))
        is_negated = False
        is_before = False
        if machine in self.seen_for_links or depth > 5:
            return [], []
        self.seen_for_links.add(machine)
        for i, part in enumerate(machine.partitions):
            for hypernym in part:
                h_name = hypernym.printname()
                # logging.info("{0}h: {1}".format(depth*"    ", h_name))
                if h_name in ("lack", "not", "before"):
                    is_negated = True
                    continue

                c_links, c_nodes = self._get_links_and_nodes(
                    hypernym, depth=depth + 1, exclude_links=i != 0)

                if not h_name.isupper():
                    links += c_links
                nodes += c_nodes

        if not exclude_links:
            links += self.get_binary_links(machine)
        if is_negated:
            add_lack = lambda link: "lack_{0}".format(link) if isinstance(link, unicode) else (
                "lack_{0}".format(link[0]), link[1])  # nopep8
            links = map(add_lack, links)
            nodes = map(add_lack, nodes)

        return links, nodes

    def get_binary_links(self, machine):
        for parent, partition in machine.parents:
            parent_pn = parent.printname()
            # if not parent_pn.isupper() or partition == 0:
            if partition == 0:
                # haven't seen it yet but possible
                continue
            elif partition == 1:
                links = set([(parent_pn, other.printname())
                             for other in parent.partitions[2]])
            elif partition == 2:
                links = set([(other.printname(), parent_pn)
                             for other in parent.partitions[1]])
            else:
                raise Exception(
                    'machine {0} has more than 3 partitions!'.format(machine))
            for link in links:
                yield link

    def contains(self, links, machine):
        pn = machine.printname()
        for link in links:
            if link == pn or (pn in link and isinstance(link, tuple)):
                self.log('link "{0}" is/contains name "{1}"'.format(link, pn))
                return True
        else:
            return False
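A hypothetical driver for Example #22's WordSimilarity, in the same Python 2 style as the class; the config path is a placeholder and must define the 'word_sim', 'similarity' and 'machine' options read in __init__:

from ConfigParser import ConfigParser

cfg = ConfigParser()
cfg.read('conf/similarity.cfg')  # placeholder path

word_sim = WordSimilarity(cfg)

# per-word feature dictionary (cached in word_sim_cache)
feats = word_sim.word_similarities('dog', 'wolf')

# phrase-level scoring additionally requires similarity/compositional = True
# in the config, so that TextTo4lang gets initialized
phrase_feats = word_sim.phrase_similarities('hot dog', 'cold beer')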
Example #23
class DepTo4lang():

    dep_regex = re.compile("([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.out_fn = self.cfg.get("machine", "ext_definitions")
        ensure_dir(os.path.dirname(self.out_fn))
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.lemmatizer = Lemmatizer(cfg)

    def read_dep_map(self, dep_map_fn):
        self.dependencies = {}
        for line in file(dep_map_fn):
            l = line.strip()
            if not l or l.startswith('#'):
                continue
            dep = Dependency.create_from_line(l)
            self.dependencies[dep.name] = dep

    def apply_dep(self, dep_str, machine1, machine2):
        if dep_str not in self.dependencies:
            logging.warning(
                'skipping dependency not in dep_to_4lang map: {0}'.format(
                    dep_str))
            return False  # not that anyone cares
        self.dependencies[dep_str].apply(machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        self.words_to_machines = {}
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    #  TODO these are words that only have pointers to an MWE
                    #  that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    #  TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue
                self.words_to_machines[word] = machine
            except Exception:
                logging.error(
                    u'skipping "{0}" because of an exception:'.format(
                        word))
                logging.info("entry: {0}".format(entry))
                traceback.print_exc()
                continue

        logging.info('done!')

    def print_graphs(self):
        print_4lang_graphs(
            self.words_to_machines,
            self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        logging.info('saving machines to {0}...'.format(self.out_fn))
        with open(self.out_fn, 'w') as out_file:
            cPickle.dump(self.words_to_machines, out_file)
        logging.info('done!')

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_dep_definition(self, word, deps):
        root_deps = filter(lambda d: d[0] == 'root', deps)
        if len(root_deps) != 1:
            logging.warning(
                u'no unique root dependency, skipping word "{0}"'.format(word))
            return None
        root_word, root_id = root_deps[0][2]
        root_lemma = self.lemmatizer.lemmatize(root_word).replace('/', '_PER_')
        root_lemma = root_word if not root_lemma else root_lemma

        word2machine = self.get_machines_from_parsed_deps(deps)

        root_machine = word2machine[root_lemma]
        word_machine = word2machine.get(word, Machine(word, ConceptControl()))
        word_machine.append(root_machine, 0)
        return word_machine

    def get_machines_from_deps(self, dep_strings):
        # deprecated, use get_machines_from_deps_and_corefs
        deps = map(DepTo4lang.parse_dependency, dep_strings)
        return self.get_machines_from_parsed_deps(deps)

    def get_machines_from_parsed_deps(self, deps):
        # deprecated, use get_machines_from_deps_and_corefs
        return self.get_machines_from_deps_and_corefs([deps], [])

    def get_machines_from_deps_and_corefs(self, dep_lists, corefs):
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no-1] = word

        # logging.info('coref index: {0}'.format(coref_index))

        lexicon = Lexicon()
        word2machine = {}

        for i, deps in enumerate(dep_lists):
            try:
                for dep, (word1, id1), (word2, id2) in deps:
                    # logging.info('w1: {0}, w2: {1}'.format(word1, word2))
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)

                    """
                    if c_word1 != word1:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word1, c_word1))
                    if c_word2 != word2:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word2, c_word2))
                    """

                    # logging.info(
                    #    'cw1: {0}, cw2: {1}'.format(c_word1, c_word2))
                    lemma1 = self.lemmatizer.lemmatize(c_word1)
                    lemma2 = self.lemmatizer.lemmatize(c_word2)

                    lemma1 = c_word1 if not lemma1 else lemma1
                    lemma2 = c_word2 if not lemma2 else lemma2

                    # TODO
                    lemma1 = lemma1.replace('/', '_PER_')
                    lemma2 = lemma2.replace('/', '_PER_')

                    # logging.info(
                    #     'lemma1: {0}, lemma2: {1}'.format(lemma1, lemma2))
                    machine1, machine2 = self._add_dependency(
                        dep, (lemma1, id1), (lemma2, id2), lexicon)

                    word2machine[lemma1] = machine1
                    word2machine[lemma2] = machine2
            except:
                logging.error("failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")

        return word2machine

    def _add_dependency(self, dep, (word1, id1), (word2, id2), lexicon):
        """Given a triplet from Stanford Dep.: D(w1,w2), we create and activate
        machines for w1 and w2, then run all operators associated with D on the
        sequence of the new machines (m1, m2)"""
        # logging.info(
        #     'adding dependency {0}({1}, {2})'.format(dep, word1, word2))
        machine1, machine2 = map(lexicon.get_machine, (word1, word2))

        self.apply_dep(dep, machine1, machine2)
        return machine1, machine2
Example #24
class DepTo4lang():

    dep_regex = re.compile("([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
        self.dependency_processor = DependencyProcessor(self.cfg)
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.undefined = set()
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.word2lemma = {}

    def read_dep_map(self, dep_map_fn):
        self.dependencies = defaultdict(list)
        for line in file(dep_map_fn):
            l = line.strip()
            if not l or l.startswith('#'):
                continue
            dep = Dependency.create_from_line(l)
            self.dependencies[dep.name].append(dep)

    def apply_dep(self, dep, machine1, machine2):
        dep_type = dep['type']
        msd1 = dep['gov'].get('msd')
        msd2 = dep['dep'].get('msd')
        if dep_type not in self.dependencies:
            if dep_type not in self.undefined:
                self.undefined.add(dep_type)
                logging.warning(
                    'skipping dependency not in dep_to_4lang map: {0}'.format(
                        dep_type))
            return False  # not that anyone cares
        for dep in self.dependencies[dep_type]:
            dep.apply(msd1, msd2, machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    #  TODO these are words that only have pointers to an MWE
                    #  that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    #  TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue

                # logging.info('adding: {0}'.format(word))
                # logging.info('ext_lex_keys: {0}'.format(
                # self.lexicon.ext_lexicon.keys()))
                self.lexicon.add(word, machine)
            except Exception:
                logging.error(u"exception caused by: '{0}'".format(word))
                # logging.error(
                #     u'skipping "{0}" because of an exception:'.format(
                #         word))
                # logging.info("entry: {0}".format(entry))
                traceback.print_exc()
                sys.exit(-1)
                continue

        logging.info('added {0}, done!'.format(c + 1))

    def print_graphs(self):
        print_4lang_graphs(self.lexicon.ext_lexicon,
                           self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        self.lexicon.save_to_binary(self.out_fn)

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_root_lemmas(self, deps):
        return [
            d['dep'].setdefault('lemma',
                                self.lemmatizer.lemmatize(d['dep']['word']))
            for d in deps if d['type'] == 'root'
        ]  # TODO

    def get_dep_definition(self, word, deps):
        deps = self.dependency_processor.process_dependencies(deps)
        root_lemmas = self.get_root_lemmas(deps)
        if not root_lemmas:
            logging.warning(
                u'no root dependency, skipping word "{0}"'.format(word))
            return None

        word2machine = self.get_machines_from_deps_and_corefs(
            [deps], [], process_deps=False)

        root_machines = filter(None, map(word2machine.get, root_lemmas))
        if not root_machines:
            logging.info("failed to find root machine")
            logging.info('root lemmas: {0}'.format(root_lemmas))
            logging.info('word2machine: {0}'.format(word2machine))
            sys.exit(-1)

        word_machine = self.lexicon.get_new_machine(word)

        for root_machine in root_machines:
            word_machine.unify(root_machine)
            word_machine.append(root_machine, 0)
        return word_machine

    def get_machines_from_deps_and_corefs(self,
                                          dep_lists,
                                          corefs,
                                          process_deps=True):
        if process_deps:
            dep_lists = map(self.dependency_processor.process_dependencies,
                            dep_lists)
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no - 1] = word

        # logging.info('coref index: {0}'.format(coref_index))

        word2machine = {}
        for deps in dep_lists:
            for dep in deps:
                for t in (dep['gov'], dep['dep']):
                    self.word2lemma[t['word']] = t.setdefault(
                        'lemma', self.lemmatizer.lemmatize(t['word']))

        for i, deps in enumerate(dep_lists):
            try:
                for dep in deps:
                    word1 = dep['gov']['word']
                    word2 = dep['dep']['word']
                    # logging.info('dep: {0}, w1: {1}, w2: {2}'.format(
                    #     repr(dep), repr(word1), repr(word2)))
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)
                    """
                    if c_word1 != word1:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word1, c_word1))
                    if c_word2 != word2:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word2, c_word2))
                    """
                    lemma1 = self.word2lemma[c_word1]
                    lemma2 = self.word2lemma[c_word2]

                    # TODO
                    # lemma1 = lemma1.replace('/', '_PER_')
                    # lemma2 = lemma2.replace('/', '_PER_')

                    # logging.info(
                    #     'lemma1: {0}, lemma2: {1}'.format(
                    #         repr(lemma1), repr(lemma2)))

                    for lemma in (lemma1, lemma2):
                        if lemma not in word2machine:
                            word2machine[lemma] = self.lexicon.get_new_machine(
                                lemma)

                    self.apply_dep(dep, word2machine[lemma1],
                                   word2machine[lemma2])

            except:
                logging.error(u"failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")

        return word2machine
Example #25
class WordSimilarity():
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')

        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section)
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))

    def log(self, string):
        if not self.batch:
            logging.info(string)

    def sim_type_to_function(self, sim_type):
        return lambda w1, w2: self.word_similarities(w1, w2)[sim_type]

    def machine_similarities(self, machine1, machine2, machine1_expand, machine2_expand):
        pn1, pn2 = machine1.printname(), machine2.printname()
        self.log(u'machine1: {0}, machine2: {1}'.format(pn1, pn2))

        links1, nodes1 = self.get_links_nodes(machine1)
        links2, nodes2 = self.get_links_nodes(machine2)
        links1_expand, nodes1_expand = self.get_links_nodes(machine1_expand)
        links2_expand, nodes2_expand = self.get_links_nodes(machine2_expand)

        self.log('links1: {0}, links2: {1}'.format(links1, links2))
        self.log('nodes1: {0}, nodes2: {1}'.format(nodes1, nodes2))
        self.log('links1_expand: {0}, links2_expand: {1}'.format(links1_expand, links2_expand))
        self.log('nodes1_expand: {0}, nodes2_expand: {1}'.format(nodes1_expand, nodes2_expand))

        sims = self.sim_feats.get_all_features(MachineInfo(machine1_expand, nodes1, nodes1_expand, links1, links1_expand),
                                               MachineInfo(machine2_expand, nodes2, nodes2_expand, links2, links2_expand))
        return sims

    def lemma_similarities(self, lemma1, lemma2):
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]

        if lemma1 == lemma2:
            lemma_sims = self.sim_feats.one_similarities()

        machine1, machine2 = map(
                self.lexicon.get_machine, (lemma1, lemma2))
        machine1_expand, machine2_expand = map(
                self.lexicon.get_expanded_definition, (lemma1, lemma2))

        if not self.batch:
            for w, m in ((lemma1, machine1), (lemma2, machine2)):
                print_4lang_graph(w, m, self.graph_dir)
            for w, m in ((lemma1, machine1_expand), (lemma2, machine2_expand)):
                print_4lang_graph(w, m, self.graph_dir + "_expand")

        lemma_sims = self.machine_similarities(machine1, machine2, machine1_expand, machine2_expand)

        self.lemma_sim_cache[(lemma1, lemma2)] = lemma_sims
        self.lemma_sim_cache[(lemma2, lemma1)] = lemma_sims
        return lemma_sims

    def word_similarities(self, word1, word2):
        if (word1, word2) in self.word_sim_cache:
            return self.word_sim_cache[(word1, word2)]
        lemma1, lemma2 = [self.lemmatizer.lemmatize(
            word, defined=self.defined_words, stem_first=True)
            for word in (word1, word2)]
        # self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
        if lemma1 is None or lemma2 is None:
            if lemma1 is None:
                logging.debug("OOV: {0}".format(word1))
            if lemma2 is None:
                logging.debug("OOV: {0}".format(word2))

            word_sims = self.sim_feats.zero_similarities()
        else:
            word_sims = self.lemma_similarities(lemma1, lemma2)
        self.word_sim_cache[(word1, word2)] = word_sims
        self.word_sim_cache[(word2, word1)] = word_sims
        return word_sims

    def get_links_nodes(self, machine, use_cache=True):
        if use_cache and machine in self.links_nodes_cache:
            return self.links_nodes_cache[machine]
        self.seen_for_links = set()
        links, nodes = self._get_links_and_nodes(machine, depth=0)
        links, nodes = set(links), set(nodes)
        links.add(machine.printname())
        nodes.add(machine.printname())
        self.links_nodes_cache[machine] = (links, nodes)
        return links, nodes

    def _get_links_and_nodes(self, machine, depth, exclude_links=False):
        name = machine.printname()
        if name.isupper() or name == '=AGT':
            links, nodes = [], []
        elif exclude_links:
            links, nodes = [], [name]
        else:
            links, nodes = [name], [name]

        # logging.info("{0}{1},{2}".format(depth*"    ", links, nodes))
        is_negated = False
        is_before = False
        if machine in self.seen_for_links or depth > 5:
            return [], []
        self.seen_for_links.add(machine)
        for i, part in enumerate(machine.partitions):
            for hypernym in part:
                h_name = hypernym.printname()
                # logging.info("{0}h: {1}".format(depth*"    ", h_name))
                if h_name in ("lack", "not", "before"):
                    is_negated = True
                    continue

                c_links, c_nodes = self._get_links_and_nodes(
                    hypernym, depth=depth+1, exclude_links=i != 0)

                if not h_name.isupper():
                    links += c_links
                nodes += c_nodes

        if not exclude_links:
            links += self.get_binary_links(machine)
        if is_negated:
            add_lack = lambda link: "lack_{0}".format(link) if isinstance(link, unicode) else ("lack_{0}".format(link[0]), link[1])  # nopep8
            links = map(add_lack, links)
            nodes = map(add_lack, nodes)

        return links, nodes

    def get_binary_links(self, machine):
        for parent, partition in machine.parents:
            parent_pn = parent.printname()
            # if not parent_pn.isupper() or partition == 0:
            if partition == 0:
                # haven't seen it yet but possible
                continue
            elif partition == 1:
                links = set([(parent_pn, other.printname())
                            for other in parent.partitions[2]])
            elif partition == 2:
                links = set([(other.printname(), parent_pn)
                            for other in parent.partitions[1]])
            else:
                raise Exception(
                    'machine {0} has more than 3 partitions!'.format(machine))
            for link in links:
                yield link

    def contains(self, links, machine):
        pn = machine.printname()
        for link in links:
            if link == pn or (pn in link and isinstance(link, tuple)):
                self.log('link "{0}" is/contains name "{1}"'.format(link, pn))
                return True
        else:
            return False
Example #26
 def __init__(self, wiki_file: str) -> None:
     self.wiki_file = wiki_file
     self.l = Lemmatizer()  # noqa
Example #27
from os import listdir
import json

from nltk import FreqDist

# project-local modules (Tokennizer, Stop_words, Lemmatizer, separar_codigo)
# are assumed to be importable alongside this script
from n_grams import N_grams

projeto = 'jquery'
arquivos = listdir('pull requests ' + projeto)

raw = ''
for arquivo in arquivos:
    with open('pull requests ' + projeto + '/' + arquivo) as json_file:
        data = json.load(json_file)
        raw = raw + str(data['body'])
texto, codigo = separar_codigo.separar(raw)

tokens = Tokennizer.tokenize(texto)
# stemmed_list = Stemmer.stemmer(tokens)
stop_worded_list = Stop_words.stop_words(tokens)
lemmatized_list = Lemmatizer.lemmatizer(stop_worded_list)
bigrams = N_grams.n_grams(lemmatized_list, 2)
trigrams = N_grams.n_grams(lemmatized_list, 3)

f_bi = FreqDist(bigrams)
f_tri = FreqDist(trigrams)
frequentes = FreqDist(lemmatized_list)

porcentagem = int(len(frequentes) / 10)

f = open('vocabularios/' + projeto + '/tokens.txt', 'w', encoding="utf-8")
for item in frequentes.most_common(porcentagem):
    # print(str(item[0]) + '; ' + str(item[1]))
    f.write(str(item[0]) + '; ' + str(item[1]) + '\n')
f.close()
Example #28
import sys
from lemmatizer import Lemmatizer
src = sys.argv[1]
tgt = sys.argv[2]
lemm_cz = Lemmatizer(src,
                     "/home/big_maggie/usr/nmt_scripts/lgmf_%s.lex" % src,
                     "il2",
                     path="/home/big_maggie/usr/nmt_scripts/liblemm.so")
lemm_en = Lemmatizer(tgt,
                     "/home/big_maggie/usr/nmt_scripts/lgmf_%s.lex" % tgt,
                     "il2",
                     path="/home/big_maggie/usr/nmt_scripts/liblemm.so")
#TODO: for each sentence, replace the entities created by the tokenizer with the original tokens again
for line in sys.stdin:
    #line=line.decode('utf-8')
    print('\t'.join(
        (str(lemm_cz.get_lang(line, 0.5,
                              src)), str(lemm_en.get_lang(line, 0.5, tgt)))))
Example #29
def main():
    # read data from the raw data file
    file_reader = FileReader('train.csv')
    # get text from raw data
    train = file_reader.get_text()
    # get label and class from raw data
    labels, cla = file_reader.get_labels()

    # because all the basic functions are implemented by ourselves in this project,
    # the data preprocessing takes longer than with nltk's built-in functions.
    # Therefore, we use only 10k records here to test the algorithm
    train_list = list(train)[:10000]
    # store data after cleaning
    print(
        'Clean the data, remove special punctuations, numbers and abbreviations....'
    )
    clean_list = list()
    cleaner = DataClean()
    for train_data in train_list:
        clean_list.append(cleaner.clean(train_data))
    print('Data clean done!')
    print('')
    tkn = Tokenizer()
    # train a random forest pos tagger classification model
    print('Training a pos tagger classification model....')
    pos_tagger, onehot_enc = train_pos_tag()
    print('Model training done!')
    print('')
    text_list = list()
    # split text into sents before pos_tag
    print('Start tokenizing and lemmatizing....')
    print('This step will take a few minutes')
    for clean_data in clean_list:
        sents = tkn.sent_tokenize(clean_data)
        text_list.append(sents)
    # features for pos_tag
    features = [
        'word', 'is_first_word', 'is_last_word', 'prev_word',
        'prev_word_last_1', 'prev_word_last_2', 'next_word', 'is_numeric',
        'first_1', 'first_2', 'first_3', 'first_4', 'last_1', 'last_2',
        'last_3', 'last_4', 'is_numeric', 'word_has_hyphen'
    ]
    # init Lemmatizer
    lem = Lemmatizer()
    lem_texts = list()

    # tokenize, pos_tag and lemmatize sentence by sentence
    for sents in text_list:
        word_features = pd.DataFrame(get_data_label(sents, label=False))
        # some data is empty
        if not word_features.empty:
            word_encode = word_features[features].values
            word_encode = onehot_enc.transform(word_encode)
            pred_pos = pos_tagger.predict(word_encode)

            lem_text = list()
            text = word_features.word
            for index in range(len(text)):
                lem_text.append(
                    lem.lemmatize(text[index], tag_map(pred_pos[index])))

            lem_texts.append(lem_text)
        else:
            lem_texts.append([])
    print('Done!')
    print('')

    print('Start building the Vocabulary for our data....')
    voc = Vocabulary(lem_texts)
    voc.remove_stop_words()
    print('Done!')
    print('')

    print('Calculating idf....')
    print('It may take 3 minutes in this step')

    # get idf word dict from Vocabulary
    idf_reference = voc.idf()
    idf = np.zeros([len(voc)])

    for word in idf_reference:
        idf[voc.pos(word)] = idf_reference[word]
    print('idf done!')
    print('')

    # the tf-idf encoding array; use a float dtype so the fractional
    # tf-idf weights are not truncated to integers
    data_array = np.zeros([len(lem_texts), len(voc)], dtype='float32')
    print('Calculating tf-idf....')
    for index, text in enumerate(lem_texts):

        vec = Vector(text, voc)
        data_array[index] = idf * vec.tf()
    print('Done!')
    print('')

    X, Y, test_X, test_Y = train_test_split(data_array, labels, test_size=0.5)

    # split the train set into 5 folds for cross-validation
    # However, cross-validation is time-consuming and not necessary in this
    # project, so we just use one validation set to choose the best threshold
    k = 5
    fold_list = k_fold(X, k=k)
    one_size = len(fold_list[0])
    train_X = np.zeros([one_size * 4, test_X.shape[1]])
    train_Y = np.zeros([one_size * 4, 6], dtype='int64')

    # split train dataset and validation dataset
    for index, fold in enumerate(fold_list):
        if index != k - 1:
            train_X[index * one_size:index * one_size + one_size] = X[fold]
            train_Y[index * one_size:index * one_size + one_size] = Y[fold]
        else:
            val_X = X[fold]
            val_Y = Y[fold]

    preds = np.zeros((len(val_X), len(cla)))
    Pred_test = np.zeros((len(test_X), len(cla)))

    # We use LogisticRegression to train one model per category (6 in total)
    for index, cat in enumerate(cla):
        print('fit', cat)
        m, r = get_mdl(train_Y[:, index], train_X)
        preds[:, index] = m.predict_proba(val_X * r)[:, 1]
        Pred_test[:, index] = m.predict_proba(test_X * r)[:, 1]

    # searching for the best threshold
    threshold = [0.55, 0.6, 0.65, 0.7, 0.75]
    result_list = list()
    for t in threshold:
        sum_result = 0
        row, col = preds.shape
        pred_Y = np.zeros([row, col])
        for i in range(row):
            for j in range(col):
                if preds[i, j] >= t:
                    pred_Y[i, j] = 1
                else:
                    pred_Y[i, j] = 0

        # print out the pred result
        print(f'Validation set Accuracy (threshold={t}):')
        for index, cat in enumerate(cla):
            result = (pred_Y[:, index] == val_Y[:, index]).sum() / len(pred_Y)
            sum_result += result
            print(f'{cat} : {result}')
        print('')
        result_list.append(sum_result)

    # Use the best threshold to predict on the test data set
    t = threshold[np.argmax(np.array(result_list))]
    print(f'The best threshold is {t}')
    row, col = Pred_test.shape
    pred_test_Y = np.zeros([row, col])
    for i in range(row):
        for j in range(col):
            if Pred_test[i, j] >= t:
                pred_test_Y[i, j] = 1
            else:
                pred_test_Y[i, j] = 0
    print('')
    print('#######################################')
    print('#######################################')
    print(f'Test set Accuracy (threshold={t}):')
    for index, cat in enumerate(cla):
        result = (pred_test_Y[:, index]
                  == test_Y[:, index]).sum() / len(pred_test_Y)
        print(f'{cat} : {result}')
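The tf-idf step above fills each row of data_array with the element-wise product of a document's term-frequency vector and the global idf vector. Here is a self-contained toy illustration of that computation with numpy; the corpus, vocabulary, and smoothed-log idf below are made up for illustration and need not match the project's Vocabulary/Vector classes.

import numpy as np

# toy corpus and vocabulary (illustrative only)
docs = [['cat', 'sat', 'mat'], ['cat', 'cat', 'dog']]
vocab = sorted({w for d in docs for w in d})
pos = {w: i for i, w in enumerate(vocab)}

# document frequency and a smoothed log idf
df = np.array([sum(w in d for d in docs) for w in vocab], dtype=float)
idf = np.log(len(docs) / df) + 1.0

# term frequency per document, then element-wise tf * idf
tfidf = np.zeros((len(docs), len(vocab)))
for i, d in enumerate(docs):
    tf = np.zeros(len(vocab))
    for w in d:
        tf[pos[w]] += 1
    tfidf[i] = tf * idf

print(np.round(tfidf, 2))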
Example #30
0
 def __init__(self, dictionary):
   self.dictionary = dictionary
   self.lemmatizer = Lemmatizer(dictionary)
   self.rules = RULES
   self.tag_query_cache = { }  # runtime cache for tag lookups in the dictionary
Example #31
0
class Parser:

  class Chart:

    class Vertex:
      
      def __init__(self, token=''):
        self.token = token

    class Edge:
      
      def __init__(self, lpos:int, rpos:int, unit:str, state:list):
        self.lpos = lpos
        self.rpos = rpos
        self.unit = unit
        self.state = state    # the right unscanned part
      
      def __lt__(self, other):
        if self.lpos != other.lpos: return self.lpos < other.lpos
        elif self.rpos != other.rpos: return self.rpos < other.rpos
        else: return self.unit <= other.unit

      def __eq__(self, other):
        return (self.lpos == other.lpos and self.rpos == other.rpos
                and self.unit == other.unit and self.state == other.state)

    def __init__(self):
      self.edges_active = [ ]
      self.edges_inactive = [ ]
      self.vertexes = [ ]   # insertion order is crucial

    def __str__(self):
      lines, vertex_flow = [ ], ''
      for i, v in enumerate(self.vertexes):
        vertex_flow += ' <%d> %s' % (i + 1, v.token)
      vertex_flow += ' <%d> ' % (len(self.vertexes) + 1)
      nlen = len(vertex_flow)
      lines.append('=' * nlen)
      lines.append(vertex_flow)
      lines.append('-' * nlen)
      rel = { }  # { (int, int): [str] }
      for e in self.edges_inactive:
        span = (e.lpos, e.rpos)
        if span in rel: rel[span].append(e.unit)
        else: rel[span] = [e.unit]
      for k in sorted(rel):
        lines.append('%r: %r' % (k, rel[k]))
      lines.append('>> found %d relations/edges.' % len(self.edges_inactive))
      lines.append('')
      return '\n'.join(lines)

    def add_vertex(self, label:str):
      self.vertexes.append(self.Vertex(label))
      return len(self.vertexes)

    def add_edge(self, lpos, rpos, tag, state:list=None):
      e = self.Edge(lpos, rpos, tag, state)
      
      if state is not None:
        if e not in self.edges_active:
          self.edges_active.append(e)
      else:
        if e not in self.edges_inactive:
          self.edges_inactive.append(e)
  
  INSTANCE = None

  def __new__(cls, *args, **kwargs):
    if not cls.INSTANCE:
      cls.INSTANCE = super().__new__(cls)
    return cls.INSTANCE

  def __init__(self, dictionary):
    self.dictionary = dictionary
    self.lemmatizer = Lemmatizer(dictionary)
    self.rules = RULES
    self.tag_query_cache = { }  # runtime cache for tag lookups in the dictionary
  
  def parse(self, sent):
    tokens = sent.split()
    agenda, agenda_hist = [ ], set()  # stack and its visit record to avoid duplicate pushes
    chart = self.Chart()
    while agenda or tokens:
      if not agenda:
        tok, tokens = tokens[0], tokens[1:]
        tok = self.lemmatizer.lemmatize(tok)

        tags = self.tag_query_cache.get(tok)
        if not tags:
          tags = {k for k, v in VOCABULARY.items() if tok in v}
          if not tags:
            dtags = (tok in self.dictionary
                     and {tag for tag, _ in self.dictionary[tok]}
                     or set())
            tags = fuck_dtags_to_tags(dtags)
          self.tag_query_cache[tok] = tags

        idx = chart.add_vertex(tok)
        for tag in tags:
          todo = (tag, idx, idx + 1)
          agenda.append(todo)
          agenda_hist.add(todo)
      else:
        target, lpos, rpos = agenda.pop()
        for unit, unscanned in self.rules:
          if unscanned and unscanned[0] == target:
            if len(unscanned) > 1:
              chart.add_edge(lpos, rpos, unit, unscanned[1:])
            else:
              todo = (unit, lpos, rpos)
              if todo not in agenda_hist:
                agenda.append(todo)
                agenda_hist.add(todo)
        chart.add_edge(lpos, rpos, target)
        for e in chart.edges_active:
          # active edge: its unit plus the remaining (unscanned) right-hand side
          unit, unscanned = e.unit, e.state
          if unscanned and unscanned[0] == target:
            if len(unscanned) > 1:
              chart.add_edge(e.lpos, rpos, unit, unscanned[1:])
            else:
              todo = (unit, e.lpos, rpos)
              if todo not in agenda_hist:
                agenda.append(todo)
                agenda_hist.add(todo)
      # print(agenda)
    return chart
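parse above is an agenda-driven, bottom-up chart recogniser: complete constituents are popped from the agenda, matched against rule right-hand sides, partially matched rules are stored as active edges, and an active edge that consumes its last symbol yields a new complete constituent. The following stripped-down, self-contained sketch shows the same control flow over a toy grammar; it does not use the project's Lemmatizer, dictionary, RULES or VOCABULARY, and it checks edge adjacency explicitly.

# toy grammar and lexicon, for illustration only
RULES = [('S', ['NP', 'VP']), ('NP', ['DET', 'N']), ('VP', ['V', 'NP'])]
LEXICON = {'the': 'DET', 'dog': 'N', 'cat': 'N', 'chased': 'V'}


def advance(lhs, lpos, rpos, rest, agenda, seen, active):
    # either keep a partially matched rule as an active edge,
    # or push the newly completed constituent onto the agenda
    if rest:
        active.append((lhs, lpos, rpos, rest))
    else:
        todo = (lhs, lpos, rpos)
        if todo not in seen:
            agenda.append(todo)
            seen.add(todo)


def parse(tokens):
    agenda, seen = [], set()   # stack of complete (unit, lpos, rpos) items
    active, complete = [], []  # active edges carry their unscanned RHS tail
    pos = 0
    while agenda or pos < len(tokens):
        if not agenda:
            unit = LEXICON[tokens[pos]]
            agenda.append((unit, pos, pos + 1))
            seen.add((unit, pos, pos + 1))
            pos += 1
            continue
        unit, lpos, rpos = agenda.pop()
        complete.append((unit, lpos, rpos))
        # start new edges from rules whose RHS begins with this unit
        for lhs, rhs in RULES:
            if rhs[0] == unit:
                advance(lhs, lpos, rpos, rhs[1:], agenda, seen, active)
        # extend active edges that end exactly where this unit starts
        for lhs, e_lpos, e_rpos, rest in list(active):
            if e_rpos == lpos and rest[0] == unit:
                advance(lhs, e_lpos, rpos, rest[1:], agenda, seen, active)
    return complete


print(parse('the dog chased the cat'.split()))  # includes ('S', 0, 5)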
Example #32
0
class WordSimilarity():
    def __init__(self, cfg):
        try:
            self.batch = cfg.getboolean('similarity_machine', 'batch')
        except NoSectionError:
            self.batch = False

        self.cfg = cfg
        self.lemmatizer = Lemmatizer(cfg)
        self.machine_wrapper = MachineWrapper(cfg)
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))

    def log(self, string):
        if not self.batch:
            logging.info(string)

    def get_links_nodes(self, machine, use_cache=True):
        if use_cache and machine in self.links_nodes_cache:
            return self.links_nodes_cache[machine]
        self.seen_for_links = set()
        links = set()
        nodes = set()
        for link, node in self._get_links_nodes(machine, depth=0):
            if link is not None:
                links.add(link)
            if node is not None:
                nodes.add(node)
        self.links_nodes_cache[machine] = (links, nodes)
        return links, nodes

    def _get_links_nodes(self, machine, depth):
        if machine in self.seen_for_links or depth > 5:
            return
        self.seen_for_links.add(machine)
        for hypernym in machine.partitions[0]:
            name = hypernym.printname()
            if name == '=AGT' or not name.isupper():
                # if depth == 0 and name not in ("lack", "to"):  # TMP!!!
                yield name, None

            for link, node in self._get_links_nodes(hypernym, depth=depth+1):
                yield link, node

        for link, node in self.get_binary_links_nodes(machine):
            yield link, node

        for node in MachineTraverser.get_nodes(machine):
            yield None, node

    def get_binary_links_nodes(self, machine):
        for parent, partition in machine.parents:
            parent_pn = parent.printname()
            # if not parent_pn.isupper() or partition == 0:
            if partition == 0:
                # haven't seen it yet but possible
                continue
            elif partition == 1:
                links = set([(parent_pn, other.printname())
                            for other in parent.partitions[2]])
                nodes = [m.printname() for m in parent.partitions[2]]
            elif partition == 2:
                links = set([(other.printname(), parent_pn)
                            for other in parent.partitions[1]])
                nodes = [m.printname() for m in parent.partitions[1]]
            else:
                raise Exception(
                    'machine {0} has more than 3 partitions!'.format(machine))

            for link in links:
                yield link, None
            for node in nodes:
                yield None, node

    def link_similarity(self, links1, links2):
        pass

    def contains(self, links, machine):
        pn = machine.printname()
        for link in links:
            if link == pn or (pn in link and isinstance(link, tuple)):
                self.log('link "{0}" is/contains name "{1}"'.format(link, pn))
                return True
        return False

    def machine_similarity(self, machine1, machine2, sim_type):
        pn1, pn2 = machine1.printname(), machine2.printname()
        self.log(u'machine1: {0}, machine2: {1}'.format(pn1, pn2))
        if sim_type == 'default':
            # sim = harmonic_mean((
            #     self._all_pairs_similarity(machine1, machine2),
            #     self._links_and_nodes_similarity(machine1, machine2)))
            sim = self._links_and_nodes_similarity(machine1, machine2)
            #                                      exclude_nodes=True)  # TMP!!
        elif sim_type == 'all_pairs':
            sim = self._all_pairs_similarity(machine1, machine2)
        elif sim_type == 'links_and_nodes':
            sim = self._links_and_nodes_similarity(machine1, machine2)
        elif sim_type == 'strict_links_and_nodes':
            sim = self._links_and_nodes_similarity(machine1, machine2,
                                                   no_contain_score=True)
        elif sim_type == 'links':
            sim = self._links_and_nodes_similarity(machine1, machine2,
                                                   exclude_nodes=True)
        elif sim_type == 'strict_links':
            sim = self._links_and_nodes_similarity(machine1, machine2,
                                                   exclude_nodes=True,
                                                   no_contain_score=True)
        else:
            raise Exception("unknown similarity type: {0}".format(sim_type))
        return sim

    def _all_pairs_similarity(self, machine1, machine2):
        words1 = set(MachineTraverser.get_nodes(machine1,
                                                exclude_words=self.stopwords))
        words2 = set(MachineTraverser.get_nodes(machine2,
                                                exclude_words=self.stopwords))
        pair_sims_by_word = defaultdict(dict)
        for word1 in words1:
            for word2 in words2:
                sim = self.word_similarity(word1, word2, -1, -1,
                                           sim_type="strict_links_and_nodes")
                pair_sims_by_word[word1][word2] = sim if sim else 0.0
                pair_sims_by_word[word2][word1] = sim if sim else 0.0

        max_sims_by_word = dict((
            (word, my_max(pair_sims_by_word[word].itervalues()))
            for word in words1 | words2))

        sim = average((average((max_sims_by_word[w] for w in words1)),
                       average((max_sims_by_word[w] for w in words2))))
        # sim = max((my_max((max_sims_by_word[w] for w in words1)),
        #            my_max((max_sims_by_word[w] for w in words2))))
        if sim:
            self.log(
                "{0} - {1} all_pairs similarity: {2} based on: {3}".format(
                    machine1.printname(), machine2.printname(), sim,
                    pair_sims_by_word))
        return sim

    def _links_and_nodes_similarity(self, machine1, machine2,
                                    exclude_nodes=False,
                                    no_contain_score=False):
        sim = 0
        links1, nodes1 = self.get_links_nodes(machine1)
        links2, nodes2 = self.get_links_nodes(machine2)
        if not no_contain_score:
            if (self.contains(links1, machine2) or
                    self.contains(links2, machine1)):
                sim = max(sim, 0.35)
            elif (not exclude_nodes) and (self.contains(nodes1, machine2) or
                                          self.contains(nodes2, machine1)):
                sim = max(sim, 0.25)
        self.log('links1: {0}, links2: {1}'.format(links1, links2))
        self.log('nodes1: {0}, nodes2: {1}'.format(nodes1, nodes2))
        pn1, pn2 = machine1.printname(), machine2.printname()
        if pn1 in links2 or pn2 in links1:
            self.log(
                "{0} and {1} connected by 0-path, returning 1".format(
                    pn1, pn2))
            return 1
        entities1 = filter(lambda l: "@" in l, links1)
        entities2 = filter(lambda l: "@" in l, links2)
        if entities1 or entities2:
            sim = max(sim, jaccard(entities1, entities2))
        else:
            sim = max(sim, jaccard(links1, links2))
            if not exclude_nodes:
                node_sim = jaccard(nodes1, nodes2)
                if node_sim > sim:
                    self.log(
                        'picking node sim ({0}) over link sim ({1})'.format(
                            node_sim, sim))
                    sim = node_sim

        return sim

    def word_similarity(self, word1, word2, pos1, pos2, sim_type='default',
                        fallback=lambda a, b, c, d: None):
        self.log(u'words: {0}, {1}'.format(word1, word2))
        lemma1, lemma2 = [self.lemmatizer.lemmatize(
            word, defined=self.machine_wrapper.definitions, stem_first=True)
            for word in (word1, word2)]
        self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
        if lemma1 is None or lemma2 is None:
            return fallback(word1, word2, pos1, pos2)
        sim = self.lemma_similarity(lemma1, lemma2, sim_type)
        self.log(u"S({0}, {1}) = {2}".format(word1, word2, sim))
        return sim

    def lemma_similarity(self, lemma1, lemma2, sim_type):
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]
        elif lemma1 == lemma2:
            return 1
        self.log(u'lemma1: {0}, lemma2: {1}'.format(lemma1, lemma2))

        machines1 = self.machine_wrapper.definitions[lemma1]
        machines2 = self.machine_wrapper.definitions[lemma2]

        pairs_by_sim = sorted([
            (self.machine_similarity(machine1, machine2, sim_type),
             (machine1, machine2))
            for machine1 in machines1 for machine2 in machines2], reverse=True)

        sim, (machine1, machine2) = pairs_by_sim[0]

        sim = sim if sim >= 0 else 0
        self.lemma_sim_cache[(lemma1, lemma2)] = sim
        self.lemma_sim_cache[(lemma2, lemma1)] = sim
        return sim
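The similarity methods above reduce link and node sets to scores with a jaccard helper that is not part of the snippet. Below is a minimal sketch of the set-based Jaccard similarity these calls appear to assume; the project's own helper may differ, for example in how empty inputs are handled.

def jaccard(xs, ys):
    # |intersection| / |union|, defined as 0.0 when both collections are empty
    s1, s2 = set(xs), set(ys)
    if not s1 and not s2:
        return 0.0
    return len(s1 & s2) / float(len(s1 | s2))


# link sets may mix plain names and (binary, argument) tuples
print(jaccard({'IN', ('follow', 'rule')}, {'IN', 'ON'}))  # 0.333...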
Example #33
0
class WordSimilarity:
    sim_types = set(
        ["links_jaccard", "nodes_jaccard", "links_contain", "nodes_contain", "0-connected", "entities_jaccard"]
    )

    def __init__(self, cfg, cfg_section="word_sim"):
        try:
            self.batch = cfg.getboolean(cfg_section, "batch")
        except NoSectionError:
            self.batch = False

        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words("english"))
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))

    def log(self, string):
        if not self.batch:
            logging.info(string)

    def uniform_similarities(self, s):
        return dict(((sim_type, s) for sim_type in WordSimilarity.sim_types))
        # TODO return {sim_type: s for sim_type in WordSimilarity.sim_types}

    def zero_similarities(self):
        return self.uniform_similarities(0.0)

    def one_similarities(self):
        return self.uniform_similarities(1.0)

    def sim_type_to_function(self, sim_type):
        return lambda w1, w2: self.word_similarities(w1, w2)[sim_type]

    def machine_similarities(self, machine1, machine2):
        pn1, pn2 = machine1.printname(), machine2.printname()
        self.log(u"machine1: {0}, machine2: {1}".format(pn1, pn2))

        sims = self.zero_similarities()
        links1, nodes1 = self.get_links_nodes(machine1)
        links2, nodes2 = self.get_links_nodes(machine2)
        self.log("links1: {0}, links2: {1}".format(links1, links2))
        self.log("nodes1: {0}, nodes2: {1}".format(nodes1, nodes2))
        if self.contains(links1, machine2) or self.contains(links2, machine1):
            sims["links_contain"] = 1

        if self.contains(nodes1, machine2) or self.contains(nodes2, machine1):
            sims["nodes_contain"] = 1

        pn1, pn2 = machine1.printname(), machine2.printname()
        # TODO
        if pn1 in links2 or pn2 in links1:
            sims["0-connected"] = 1

        entities1 = filter(lambda l: "@" in l, links1)
        entities2 = filter(lambda l: "@" in l, links2)
        sims["entities_jaccard"] = jaccard(entities1, entities2)

        sims["links_jaccard"] = jaccard(links1, links2)
        sims["nodes_jaccard"] = jaccard(nodes1, nodes2)

        return sims

    def lemma_similarities(self, lemma1, lemma2):
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]

        if lemma1 == lemma2:
            # identical lemmas are maximally similar; return early so the
            # result is not overwritten by the machine comparison below
            return self.one_similarities()

        if self.expand:
            machine1, machine2 = map(self.lexicon.get_expanded_definition, (lemma1, lemma2))
        else:
            machine1, machine2 = map(self.lexicon.get_machine, (lemma1, lemma2))

        if not self.batch:
            for w, m in ((lemma1, machine1), (lemma2, machine2)):
                print_4lang_graph(w, m, self.graph_dir)
        lemma_sims = self.machine_similarities(machine1, machine2)
        self.lemma_sim_cache[(lemma1, lemma2)] = lemma_sims
        self.lemma_sim_cache[(lemma2, lemma1)] = lemma_sims
        return lemma_sims

    def word_similarities(self, word1, word2):
        if (word1, word2) in self.word_sim_cache:
            return self.word_sim_cache[(word1, word2)]
        lemma1, lemma2 = [
            self.lemmatizer.lemmatize(word, defined=self.defined_words, stem_first=True) for word in (word1, word2)
        ]
        # self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
        if lemma1 is None or lemma2 is None:
            if lemma1 is None:
                logging.debug("OOV: {0}".format(word1))
            if lemma2 is None:
                logging.debug("OOV: {0}".format(word2))
            # TODO
            word_sims = self.zero_similarities()
        else:
            word_sims = self.lemma_similarities(lemma1, lemma2)
        self.word_sim_cache[(word1, word2)] = word_sims
        self.word_sim_cache[(word2, word1)] = word_sims
        return word_sims

    def get_links_nodes(self, machine, use_cache=True):
        if use_cache and machine in self.links_nodes_cache:
            return self.links_nodes_cache[machine]
        self.seen_for_links = set()
        links, nodes = self._get_links_and_nodes(machine, depth=0)
        links, nodes = set(links), set(nodes)
        links.add(machine.printname())
        nodes.add(machine.printname())
        self.links_nodes_cache[machine] = (links, nodes)
        return links, nodes

    def _get_links_and_nodes(self, machine, depth, exclude_links=False):
        name = machine.printname()
        if name.isupper() or name == "=AGT":
            links, nodes = [], []
        elif exclude_links:
            links, nodes = [], [name]
        else:
            links, nodes = [name], [name]

        # logging.info("{0}{1},{2}".format(depth*"    ", links, nodes))
        is_negated = False
        if machine in self.seen_for_links or depth > 5:
            return [], []
        self.seen_for_links.add(machine)
        for i, part in enumerate(machine.partitions):
            for hypernym in part:
                h_name = hypernym.printname()
                # logging.info("{0}h: {1}".format(depth*"    ", h_name))
                if h_name in ("lack", "not"):
                    is_negated = True
                    continue

                c_links, c_nodes = self._get_links_and_nodes(hypernym, depth=depth + 1, exclude_links=i != 0)

                if not h_name.isupper():
                    links += c_links
                nodes += c_nodes

        if not exclude_links:
            links += self.get_binary_links(machine)
        if is_negated:
            add_lack = (
                lambda link: "lack_{0}".format(link)
                if isinstance(link, unicode)
                else ("lack_{0}".format(link[0]), link[1])
            )  # nopep8
            links = map(add_lack, links)
            nodes = map(add_lack, nodes)

        return links, nodes

    def get_binary_links(self, machine):
        for parent, partition in machine.parents:
            parent_pn = parent.printname()
            # if not parent_pn.isupper() or partition == 0:
            if partition == 0:
                # haven't seen it yet but possible
                continue
            elif partition == 1:
                links = set([(parent_pn, other.printname()) for other in parent.partitions[2]])
            elif partition == 2:
                links = set([(other.printname(), parent_pn) for other in parent.partitions[1]])
            else:
                raise Exception("machine {0} has more than 3 partitions!".format(machine))
            for link in links:
                yield link

    def contains(self, links, machine):
        pn = machine.printname()
        for link in links:
            if link == pn or (pn in link and isinstance(link, tuple)):
                self.log('link "{0}" is/contains name "{1}"'.format(link, pn))
                return True
        return False
Example #34
0
class DepTo4lang():

    dep_regex = re.compile("([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.out_fn = self.cfg.get("machine", "ext_definitions")
        ensure_dir(os.path.dirname(self.out_fn))
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.lemmatizer = Lemmatizer(cfg)

    def read_dep_map(self, dep_map_fn):
        self.dependencies = {}
        for line in file(dep_map_fn):
            l = line.strip()
            if not l or l.startswith('#'):
                continue
            dep = Dependency.create_from_line(l)
            self.dependencies[dep.name] = dep

    def apply_dep(self, dep_str, machine1, machine2):
        if dep_str not in self.dependencies:
            logging.warning(
                'skipping dependency not in dep_to_4lang map: {0}'.format(
                    dep_str))
            return False  # not that anyone cares
        self.dependencies[dep_str].apply(machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        self.words_to_machines = {}
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    #  TODO these are words that only have pointers to an MWE
                    #  that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    #  TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue
                self.words_to_machines[word] = machine
            except Exception:
                logging.error(
                    u'skipping "{0}" because of an exception:'.format(word))
                logging.info("entry: {0}".format(entry))
                traceback.print_exc()
                continue

        logging.info('done!')

    def print_graphs(self):
        print_4lang_graphs(self.words_to_machines,
                           self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        logging.info('saving machines to {0}...'.format(self.out_fn))
        with open(self.out_fn, 'w') as out_file:
            cPickle.dump(self.words_to_machines, out_file)
        logging.info('done!')

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_dep_definition(self, word, deps):
        root_deps = filter(lambda d: d[0] == 'root', deps)
        if len(root_deps) != 1:
            logging.warning(
                u'no unique root dependency, skipping word "{0}"'.format(word))
            return None
        root_word, root_id = root_deps[0][2]
        root_lemma = self.lemmatizer.lemmatize(root_word).replace('/', '_PER_')
        root_lemma = root_word if not root_lemma else root_lemma

        word2machine = self.get_machines_from_parsed_deps(deps)

        root_machine = word2machine[root_lemma]
        word_machine = word2machine.get(word, Machine(word, ConceptControl()))
        word_machine.append(root_machine, 0)
        return word_machine

    def get_machines_from_deps(self, dep_strings):
        # deprecated, use get_machines_from_deps_and_corefs
        deps = map(DepTo4lang.parse_dependency, dep_strings)
        return self.get_machines_from_parsed_deps(deps)

    def get_machines_from_parsed_deps(self, deps):
        # deprecated, use get_machines_from_deps_and_corefs
        return self.get_machines_from_deps_and_corefs([deps], [])

    def get_machines_from_deps_and_corefs(self, dep_lists, corefs):
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no - 1] = word

        # logging.info('coref index: {0}'.format(coref_index))

        lexicon = Lexicon()
        word2machine = {}

        for i, deps in enumerate(dep_lists):
            try:
                for dep, (word1, id1), (word2, id2) in deps:
                    # logging.info('w1: {0}, w2: {1}'.format(word1, word2))
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)
                    """
                    if c_word1 != word1:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word1, c_word1))
                    if c_word2 != word2:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word2, c_word2))
                    """

                    # logging.info(
                    #    'cw1: {0}, cw2: {1}'.format(c_word1, c_word2))
                    lemma1 = self.lemmatizer.lemmatize(c_word1)
                    lemma2 = self.lemmatizer.lemmatize(c_word2)

                    lemma1 = c_word1 if not lemma1 else lemma1
                    lemma2 = c_word2 if not lemma2 else lemma2

                    # TODO
                    lemma1 = lemma1.replace('/', '_PER_')
                    lemma2 = lemma2.replace('/', '_PER_')

                    # logging.info(
                    #     'lemma1: {0}, lemma2: {1}'.format(lemma1, lemma2))
                    machine1, machine2 = self._add_dependency(
                        dep, (lemma1, id1), (lemma2, id2), lexicon)

                    word2machine[lemma1] = machine1
                    word2machine[lemma2] = machine2
            except:
                logging.error("failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")

        return word2machine

    def _add_dependency(self, dep, (word1, id1), (word2, id2), lexicon):
        """Given a triplet from Stanford Dep.: D(w1,w2), we create and activate
        machines for w1 and w2, then run all operators associated with D on the
        sequence of the new machines (m1, m2)"""
        # logging.info(
        #     'adding dependency {0}({1}, {2})'.format(dep, word1, word2))
        machine1, machine2 = map(lexicon.get_machine, (word1, word2))

        self.apply_dep(dep, machine1, machine2)
        return machine1, machine2
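parse_dependency above relies on the dep_regex declared at the top of DepTo4lang to split a Stanford-style dependency string into the relation name and two (word, index) pairs. Here is a quick standalone illustration of that pattern on a made-up dependency string.

import re

# the same pattern as DepTo4lang.dep_regex, written as a raw string
dep_regex = re.compile(r"([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

dep, word1, id1, word2, id2 = dep_regex.match("nsubj(chased-2, dog-1)").groups()
print(dep, (word1, id1), (word2, id2))  # nsubj ('chased', '2') ('dog', '1')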
Example #35
0
from lemmatizer import Lemmatizer

lemma = Lemmatizer()

print(lemma.lemmatize('bersetubuh'), lemma.lemmatize('berdansa'),
      lemma.lemmatize('penamaan'), lemma.lemmatize('berusaha'),
      lemma.lemmatize('berdansa'), lemma.lemmatize('bolak-balik'),
      lemma.lemmatize('gemetar'), lemma.lemmatize('petanggungjawaban'),
      lemma.lemmatize('kepastian'), lemma.lemmatize('berpendidikan'),
      lemma.lemmatize('berhubungan'), lemma.lemmatize('berwawasan'),
      lemma.lemmatize('pengetahuan'), lemma.lemmatize('pengembala'),
      lemma.lemmatize('penarikan'), lemma.lemmatize('terbengkalai'),
      lemma.lemmatize('rumahku'), lemma.lemmatize('penanggulangan'),
      lemma.lemmatize('perpecahan'), lemma.lemmatize('pemalas'),
      lemma.lemmatize('tertikunganlah'), lemma.lemmatize('perdamaian'),
      lemma.lemmatize('terbirit-birit'), lemma.lemmatize('cebokan'),
      lemma.lemmatize('mengotomatisasikan'), lemma.lemmatize('menyelesaikan'),
      lemma.lemmatize('sekawasan'), lemma.lemmatize('pengertian'),
      lemma.lemmatize('ketidakpastian'))
Example #36
0
File: app.py Project: pawel717/PJN

def indexCorpus():
    indexer = Indexer(database)
    # index normal articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles")
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()
    # index lemmatized articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles_lemma")
    indexer.output_catalog = "./indexes_lemmatized/"
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()


if __name__ == "__main__":
    if (len(sys.argv) > 2):
        _usage()

    lemmatizer = Lemmatizer()
    lemmatizer.makeDictionaryMap()

    if (len(sys.argv) == 2):
        if (sys.argv[1] == 'index'):
            database = Database()
            lemmatizeCorpus(lemmatizer)
            indexCorpus()

    app.run()