コード例 #1
0
 def classify(self, txt, topics):
     """Split *txt* into sentences and yield (label, topic, sentence) triples.

     For each sentence that is not ignored, candidate topics are the union
     of the caller-supplied *topics* and noun phrases extracted from the
     sentence itself.  A sentence is yielded at most once: with the label
     returned by ``self._classify`` for the first topic that classifies it,
     or with label ``'nodef'`` when no topic classifies it but
     ``self._is_no_def`` accepts it.
     """
     sentences = sent_tokenize(txt)
     for sentence in sentences:
         # Skip sentences the helper marks as irrelevant.
         if self._ignore(sentence):
             continue
         extract_topics = NPExtractor(sentence).extract()
         # Union of supplied topics and extracted noun phrases, de-duplicated.
         # NOTE(review): set ordering makes the order of _topics — and thus
         # which topic wins below — nondeterministic across runs.
         _topics = list(set(topics).union(set(extract_topics)))
         # Keep only short candidates (fewer than 5 words).
         _topics = [x for x in _topics if len(x.split()) < 5]
         if len(_topics) == 0:
             continue
         is_classified = False
         for topic in _topics:
             # Only single-token alphanumeric topics are tried.
             if not topic.isalnum():
                 continue
             classifier = self._classify(sentence, topic)
             if not classifier:
                 continue
             # Re-tokenize to normalize whitespace before yielding.
             sentence = ' '.join(util.tokenize(sentence))
             yield classifier, topic, sentence
             is_classified = True
             break
         if not is_classified and self._is_no_def(sentence):
             # Fallback topic: the first candidate if it appears verbatim in
             # the sentence, otherwise the second candidate when one exists.
             topic = _topics[0]
             if _topics[0] in sentence:
                 topic = _topics[0]
             elif len(_topics) > 1:
                 topic = _topics[1]
             sentence = ' '.join(util.tokenize(sentence))
             yield 'nodef', topic, sentence
コード例 #2
0
    def pull(self, dest, download):
        """Fetch the w00 dataset and split it into DEF / NODEF TSV files.

        Args:
            dest: directory where the archive is saved/extracted and where
                the output TSV files are written.
            download: when truthy, download and unzip the archive first.

        Raises:
            FileNotFoundError: if the expected source files are missing
                after (optional) extraction.
        """
        print('Pulling from w00 dataset...')
        if download:
            f_path = os.path.join(dest, 'w00.zip')
            with open(f_path, 'wb') as f_out:
                f_out.write(urllib.request.urlopen(self._LINK).read())
            with zipfile.ZipFile(f_path) as zipf:
                zipf.extractall(dest)

        # Nothing to do if the outputs were already produced.
        if util.tsv_already_exist(dest,
                                  [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
            return

        source_word = os.path.join(dest, self._SOURCE_WORD)
        source_meta = os.path.join(dest, self._SOURCE_META)
        for path in (source_word, source_meta):
            # Explicit check instead of assert: asserts vanish under -O.
            if not os.path.exists(path):
                raise FileNotFoundError(path)

        f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
        f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
        # Context managers close both source files (the original leaked the
        # handles returned by the bare open() calls).
        with open(source_word) as f_word, open(source_meta) as f_meta:
            for line, meta in zip(f_word, f_meta):
                line = line.strip()
                if not line:
                    continue
                line = ' '.join(util.tokenize(line))
                # The meta file marks definitional sentences with a leading '1'.
                def_flag = 1 if meta.startswith('1') else 0
                if def_flag == 1:
                    util.save_output(f_out_def_path, line, def_flag, self.KEY)
                else:
                    util.save_output(f_out_nodef_path, line, def_flag,
                                     self.KEY)

        print('\tDONE\n')
コード例 #3
0
ファイル: msresearch.py プロジェクト: linnal/DeON
    def pull(self, dest, download):
        """Fetch the msresearch dataset and split it into DEF / NODEF TSVs.

        Each input line has the form ``DEF/phrase`` or ``NODEF/phrase``; the
        prefix decides which output file receives the tokenized phrase.

        Args:
            dest: directory where the raw file is saved and outputs written.
            download: when truthy, download the raw file first.
        """
        print('Pulling from msresearch dataset...')
        f_path = os.path.join(dest, 'msresearch.txt')
        if download:
            with open(f_path, 'wb') as f_out:
                f_out.write(urllib.request.urlopen(self._LINK).read())

        # Nothing to do if the outputs were already produced.
        if util.tsv_already_exist(dest,
                                  [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
            return

        f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
        f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
        # Context manager closes the source file (the original `open(f_path)`
        # handle was never closed).
        with open(f_path) as source:
            for line in source:
                line = line.strip()
                if not line:
                    continue

                # Split only on the first '/' — the phrase may contain more.
                is_def, phrase = line.split('/', 1)
                def_flag = is_def == 'DEF'
                _def = 1 if def_flag else 0
                f_out_path = f_out_nodef_path
                topic, pos = self._extract_topic_pos(phrase)
                if _def:
                    f_out_path = f_out_def_path
                phrase = ' '.join(util.tokenize(phrase))
                util.save_output(f_out_path, phrase, _def, self.KEY, topic,
                                 pos)

        print('\tDONE\n')
コード例 #4
0
 def _classify(self, sentence, topic):
     """Label *sentence* as 'anafora' or 'def', or return None otherwise."""
     # Lowercase and re-tokenize before running the detectors.
     normalized = ' '.join(util.tokenize(sentence.lower()))
     lowered_topic = topic.lower()
     if self._is_anafora(normalized):
         return 'anafora'
     return 'def' if self._is_def(normalized, lowered_topic) else None
コード例 #5
0
    def pull(self, dest, download):
        """Fetch the WCL dataset and split it into DEF / NODEF TSV files.

        Each source file alternates between a sentence template containing
        the placeholder ``TARGET`` and a ``<subject>: ...`` metadata line;
        the subject is substituted into the preceding template.

        Args:
            dest: directory where the archive is saved/extracted and where
                the output TSV files are written.
            download: when truthy, download and untar the archive first.

        Raises:
            FileNotFoundError: if an expected source file is missing after
                (optional) extraction.
        """
        print('Pulling from wcl dataset...')
        if download:
            f_path = os.path.join(dest, 'wcl.tar.gz')
            with open(f_path, 'wb') as f_out:
                f_out.write(urllib.request.urlopen(self._LINK).read())
            with tarfile.open(f_path, 'r:gz') as targz:
                targz.extractall(dest)

        # Nothing to do if the outputs were already produced.
        if util.tsv_already_exist(dest,
                                  [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
            return

        source_uwak = os.path.join(dest, self._SOURCE_UKWAC)
        source_good = os.path.join(dest, self._SOURCE_WIKI_GOOD)
        source_bad = os.path.join(dest, self._SOURCE_WIKI_BAD)

        # (path, is_definition) pairs: ukWaC and wiki-good are definitional.
        sources = [(source_uwak, True), (source_good, True),
                   (source_bad, False)]
        for source, _ in sources:
            # Explicit check instead of assert: asserts vanish under -O.
            if not os.path.exists(source):
                raise FileNotFoundError(source)

        f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
        f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
        for source, _def in sources:
            prev_line = ''
            # Context manager closes each source file (the original opened
            # one unclosed handle per source).
            with open(source) as f_in:
                for i, line in enumerate(f_in):
                    line = line.replace('\t', '')
                    line = line.strip('! #\n')
                    if not line:
                        continue
                    # Even lines carry the sentence template; stash and wait
                    # for the matching metadata line.
                    if i % 2 == 0:
                        prev_line = line
                        continue

                    subject, _ = line.split(':', maxsplit=1)
                    phrase = prev_line.replace('TARGET', subject)
                    phrase = ' '.join(util.tokenize(phrase))
                    is_def = 1 if _def else 0
                    if is_def == 1:
                        # Topic position is only recorded for definitions.
                        pos = util.topic_position(subject, phrase)
                        util.save_output(f_out_def_path,
                                         phrase,
                                         is_def,
                                         self.KEY,
                                         topic=subject,
                                         topic_pos=pos)
                    else:
                        util.save_output(f_out_nodef_path,
                                         phrase,
                                         is_def,
                                         self.KEY,
                                         topic=subject)

        print('\tDONE\n')
コード例 #6
0
ファイル: difference_between.py プロジェクト: linnal/DeON
    def _extract_topics_definitions_from(self, article, topics):
        """Return a set of (topic, tokenized definition) pairs for *article*.

        A candidate definition is kept when, lowercased, it begins with the
        topic — optionally preceded by 'a ' or 'an ' — followed by 'is' or
        'are'.  At most one definition is kept per topic.
        """
        result = set()
        definitions = DifferenceBetween(article, topics).extractDefinitions()

        for topic in topics:
            for _def in definitions.values():
                if _def is None:
                    continue
                lower_def = _def.lower()

                # Fix: the original pattern '((a)|(an) )?' put the space only
                # inside the 'an' alternative, so "a <topic> is ..." could
                # never match.  re.escape guards against regex metacharacters
                # in the topic string.
                # NOTE(review): topic is matched as-is against the lowercased
                # definition — confirm topics are already lowercase upstream.
                pattern = r"^((a|an) )?{} (is|are).+".format(re.escape(topic))
                if re.match(pattern, lower_def):
                    result.add((topic, ' '.join(util.tokenize(_def))))
                    break
        return result