Code example #1
    def pull(self, dest, download):
        print('Pulling from w00 dataset...')
        if download:
            f_path = os.path.join(dest, 'w00.zip')
            with open(f_path, 'wb') as f_out:
                f_out.write(urllib.request.urlopen(self._LINK).read())
            with zipfile.ZipFile(f_path) as zipf:
                zipf.extractall(dest)

        if util.tsv_already_exist(dest,
                                  [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
            return

        source_word = os.path.join(dest, self._SOURCE_WORD)
        source_meta = os.path.join(dest, self._SOURCE_META)
        assert os.path.exists(source_word)
        assert os.path.exists(source_meta)

        f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
        f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
        # Walk the sentence file and its metadata file in parallel; a metadata
        # line starting with '1' marks the paired sentence as a definition.
        for line, meta in zip(open(source_word), open(source_meta)):
            line = line.strip()
            if not line:
                continue
            line = ' '.join(util.tokenize(line))
            def_flag = 1 if meta.startswith('1') else 0
            if def_flag == 1:
                util.save_output(f_out_def_path, line, def_flag, self.KEY)
            else:
                util.save_output(f_out_nodef_path, line, def_flag, self.KEY)

        print('\tDONE\n')
        return
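All six pull() methods lean on a small set of util helpers. The sketch below is a minimal, hypothetical reconstruction inferred only from the call sites in these examples; the helper names come from the snippets, but the bodies, the TSV column order, and the tab delimiter are assumptions, not DeON's actual implementation.

import csv
import os


def tsv_already_exist(dest, file_names):
    """Return True if every expected output TSV already exists in dest."""
    if isinstance(file_names, str):
        file_names = [file_names]
    return all(os.path.exists(os.path.join(dest, name)) for name in file_names)


def tokenize(text):
    """Stand-in tokenizer: plain whitespace splitting replaces the real one."""
    return text.split()


def save_output(path, phrase, def_flag, key, topic=None, topic_pos=None):
    """Append one example as a tab-separated row: sentence, 0/1 definition
    flag, dataset key, and optional topic / topic position columns."""
    with open(path, 'a', newline='') as f_out:
        csv.writer(f_out, delimiter='\t').writerow(
            [phrase, def_flag, key, topic or '', topic_pos or ''])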
Code example #2
File: msresearch.py  Project: linnal/DeON
    def pull(self, dest, download):
        print('Pulling from msresearch dataset...')
        f_path = os.path.join(dest, 'msresearch.txt')
        if download:
            with open(f_path, 'wb') as f_out:
                f_out.write(urllib.request.urlopen(self._LINK).read())

        if util.tsv_already_exist(dest,
                                  [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
            return

        source = open(f_path)
        f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
        f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
        for line in source:
            line = line.strip()
            if not line:
                continue

            # The text before the first '/' is a DEF/NODEF label; DEF phrases
            # go to the def TSV, everything else to the nodef TSV.
            is_def, phrase = line.split('/', 1)
            _def = 1 if is_def == 'DEF' else 0
            f_out_path = f_out_def_path if _def else f_out_nodef_path
            topic, pos = self._extract_topic_pos(phrase)
            phrase = ' '.join(util.tokenize(phrase))
            util.save_output(f_out_path, phrase, _def, self.KEY, topic, pos)

        print('\tDONE\n')
        return
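To make the routing in example #2 concrete, the same split is applied below to two invented sample lines; only the 'label/phrase' shape is taken from the code, the sentences themselves are made up.

for line in ['DEF/A compiler is a program that translates source code.',
             'NODEF/The compiler finished in two seconds.']:
    is_def, phrase = line.split('/', 1)
    print(1 if is_def == 'DEF' else 0, phrase)
# prints:
# 1 A compiler is a program that translates source code.
# 0 The compiler finished in two seconds.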
Code example #3
    def pull(self, dest, download):
        print('Pulling from wcl dataset...')
        if download:
            f_path = os.path.join(dest, 'wcl.tar.gz')
            with open(f_path, 'wb') as f_out:
                f_out.write(urllib.request.urlopen(self._LINK).read())
            with tarfile.open(f_path, 'r:gz') as targz:
                targz.extractall(dest)

        if util.tsv_already_exist(dest,
                                  [self._OUT_DEF_FILE, self._OUT_NODEF_FILE]):
            return

        source_ukwac = os.path.join(dest, self._SOURCE_UKWAC)
        source_good = os.path.join(dest, self._SOURCE_WIKI_GOOD)
        source_bad = os.path.join(dest, self._SOURCE_WIKI_BAD)

        # Pair each source file with its definition flag: the ukWaC and "good"
        # Wikipedia sentences count as definitions, the "bad" ones do not.
        sources = [(source_ukwac, True), (source_good, True),
                   (source_bad, False)]
        for source, _ in sources:
            assert os.path.exists(source)

        f_out_def_path = os.path.join(dest, self._OUT_DEF_FILE)
        f_out_nodef_path = os.path.join(dest, self._OUT_NODEF_FILE)
        for source, _def in sources:
            prev_line = ''
            # Lines come in pairs: an even-numbered line holds a sentence
            # template containing the literal token 'TARGET'; the next line
            # starts with the subject that replaces it.
            for i, line in enumerate(open(source)):
                line = line.replace('\t', '')
                line = line.strip('! #\n')
                if not line:
                    continue
                if i % 2 == 0:
                    prev_line = line
                    continue

                subject, _ = line.split(':', maxsplit=1)
                phrase = prev_line.replace('TARGET', subject)
                phrase = ' '.join(util.tokenize(phrase))
                is_def = 1 if _def else 0
                if is_def == 1:
                    pos = util.topic_position(subject, phrase)
                    util.save_output(f_out_def_path,
                                     phrase,
                                     is_def,
                                     self.KEY,
                                     topic=subject,
                                     topic_pos=pos)
                else:
                    util.save_output(f_out_nodef_path,
                                     phrase,
                                     is_def,
                                     self.KEY,
                                     topic=subject)

        print('\tDONE\n')
        return
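The line pairing in example #3 is easiest to see on a tiny input. The two sample lines below are invented; they only mimic the shape the loop expects (a '!'-prefixed template line containing TARGET, followed by a 'subject: ...' annotation line).

sample_lines = [
    '! TARGET is a family of unsupervised learning algorithms .',
    'clustering: wikipedia_good',
]

prev_line = ''
for i, line in enumerate(sample_lines):
    line = line.replace('\t', '').strip('! #\n')
    if not line:
        continue
    if i % 2 == 0:
        prev_line = line  # remember the TARGET template
        continue
    subject, _ = line.split(':', maxsplit=1)
    print(prev_line.replace('TARGET', subject))
# prints: clustering is a family of unsupervised learning algorithms .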
Code example #4
File: dictionary.py  Project: linnal/DeON
    def pull(self, dest, download):
        print('Pulling for dictionary dataset...')
        self.wcl_process = util.start_wcl_process()
        self.dest = dest

        folder_path = os.path.join(dest, self._PAGE_FOLDER)
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

        if download:
            self._save_locally(folder_path)

        if util.tsv_already_exist(dest, self._WCL_OUT_FILE):
            return

        self._extract_topics_definitions_from(folder_path)
        return
Code example #5
File: difference_between.py  Project: linnal/DeON
    def pull(self, dest, download):
        print('Pulling from diffbetween dataset...')
        self.dest = dest
        self.f_out_path = os.path.join(dest, self._OUT_FILES[0])
        folder_path = os.path.join(dest, self._PAGE_FOLDER)
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

        if download:
            downloader = Downloader(folder_path)
            downloader.save_locally()
            print('\n')

        if util.tsv_already_exist(dest, self._OUT_FILES):
            return

        self._extract_from(folder_path)
        print('\n\tDONE\n')
        return
Code example #6
    def pull(self, dest, download):
        print('Pulling from wikipedia dataset...\n')

        if util.tsv_already_exist(dest, self._OUT_FILES):
            return

        self.dest = dest
        wiki_folder = os.path.join(dest, self._INPUT_FOLDER)
        classifier = TxtClassifier()

        content_folders = os.listdir(wiki_folder)
        for i, folder in enumerate(content_folders):
            content_folder_path = os.path.join(wiki_folder, folder)
            content_files = os.listdir(content_folder_path)
            for j, file_name in enumerate(content_files):
                # Progress label: '<folder index> [<file index>/<files in folder>]'.
                progress = '{} [{}/{}]'.format(i, j, len(content_files))
                util.print_progress('Extracting def/nodef ', progress,
                                    len(content_folders))
                file = os.path.join(content_folder_path, file_name)
                self._parse(file, classifier)
        return
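The nested listdir loops in example #6 assume one level of content folders, each holding extracted text files. For that layout the same files can be visited with os.walk, as sketched below; this variant drops the per-folder progress counters used above and is only an illustration of the traversal, not DeON's code.

import os


def iter_wiki_files(wiki_folder):
    """Yield the path of every file below wiki_folder, folder by folder."""
    for root, _dirs, files in os.walk(wiki_folder):
        for file_name in sorted(files):
            yield os.path.join(root, file_name)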