Ejemplo n.º 1
0
 def get_parser_and_lang(self):
     """Select the dictionary parser and language code from the config.

     Reads the ``input_type`` option of the ``dict`` section and sets
     ``self.parser`` (a fresh parser instance) and ``self.lang``
     ('eng' or 'hun') to match it.

     Raises:
         Exception: if the configured input type is not recognised.
     """
     input_type = self.cfg.get('dict', 'input_type')
     logging.info('input type: {0}'.format(input_type))

     # The factories are wrapped in lambdas so that only the parser class
     # that was actually selected gets looked up and instantiated.
     choices = {
         'wiktionary': (lambda: WiktParser(), 'eng'),
         'longman': (lambda: LongmanParser(), 'eng'),
         'collins': (lambda: CollinsParser(), 'eng'),
         'eksz': (lambda: EkszParser(), 'hun'),
         'nszt': (lambda: NSzTParser(), 'hun'),
     }
     if input_type not in choices:
         raise Exception('unknown input format: {0}'.format(input_type))

     make_parser, lang_code = choices[input_type]
     self.parser = make_parser()
     self.lang = lang_code
Ejemplo n.º 2
0
 def get_parser_and_lang(self):
     """Set ``self.parser`` and ``self.lang`` for the configured input.

     The input type is read from the ``input_type`` option of the
     ``dict`` config section.

     Raises:
         Exception: if no parser is known for the configured input type.
     """
     input_type = self.cfg.get('dict', 'input_type')
     logging.info('input type: {0}'.format(input_type))

     # (input type, parser factory, language code); factories are lazy so
     # that only the matching parser class is resolved and constructed.
     known_inputs = (
         ('wiktionary', lambda: WiktParser(), 'eng'),
         ('longman', lambda: LongmanParser(), 'eng'),
         ('collins', lambda: CollinsParser(), 'eng'),
         ('eksz', lambda: EkszParser(), 'hun'),
         ('nszt', lambda: NSzTParser(), 'hun'),
     )
     for type_name, build_parser, lang_code in known_inputs:
         if input_type == type_name:
             self.parser = build_parser()
             self.lang = lang_code
             return

     raise Exception('unknown input format: {0}'.format(input_type))
Ejemplo n.º 3
0
class DictTo4lang():
    """Build a headword-indexed dictionary from a parsed lexicon file.

    Entries are read with an input-type-specific parser, merged per
    headword, preprocessed, run through a language-specific wrapper
    (``StanfordWrapper`` for 'eng', ``Magyarlanc`` for 'hun') and stored in
    ``self.dictionary``, which can be dumped to and loaded from JSON.
    """

    def __init__(self, cfg):
        """Read paths from *cfg* and select the parser for the input type.

        Args:
            cfg: config object queried as ``cfg.get(section, option)``.
        """
        self.dictionary = {}
        self.cfg = cfg
        self.output_fn = self.cfg.get('dict', 'output_file')
        # NOTE(review): ensure_dir is defined elsewhere; presumably it
        # creates the directory when it does not exist yet.
        ensure_dir(os.path.dirname(self.output_fn))
        self.tmp_dir = self.cfg.get('data', 'tmp_dir')
        ensure_dir(self.tmp_dir)
        self.graph_dir = self.cfg.get('machine', 'graph_dir')
        ensure_dir(self.graph_dir)
        self.get_parser_and_lang()
        self.machine_wrapper = None

    def get_parser_and_lang(self):
        """Set ``self.parser`` and ``self.lang`` from the configured input.

        Raises:
            Exception: if the ``input_type`` option of the ``dict`` section
                is not one of the known dictionary formats.
        """
        input_type = self.cfg.get('dict', 'input_type')
        logging.info('input type: {0}'.format(input_type))
        if input_type == 'wiktionary':
            self.parser = WiktParser()
            self.lang = 'eng'
        elif input_type == 'longman':
            self.parser = LongmanParser()
            self.lang = 'eng'
        elif input_type == 'collins':
            self.parser = CollinsParser()
            self.lang = 'eng'
        elif input_type == 'eksz':
            self.parser = EkszParser()
            self.lang = 'hun'
        elif input_type == 'nszt':
            self.parser = NSzTParser()
            self.lang = 'hun'
        else:
            raise Exception('unknown input format: {0}'.format(input_type))

    def parse_dict(self):
        """Parse the configured input file into ``self.raw_dict``.

        Entries sharing a headword are merged; entries without any senses
        are skipped.
        """
        input_file = self.cfg.get('dict', 'input_file')
        self.raw_dict = defaultdict(dict)
        for entry in self.parser.parse_file(input_file):
            if 'senses' not in entry or entry['senses'] == []:
                continue  # todo
            self.unify(self.raw_dict[entry['hw']], entry)

    def unify(self, entry1, entry2):
        """Merge *entry2* into *entry1* in place.

        An empty *entry1* takes over the contents of *entry2*; otherwise
        the senses of *entry2* are appended to those of *entry1*.

        Raises:
            Exception: if *entry1* is non-empty and the headwords differ.
        """
        if entry1 == {}:
            # After update() both entries share the same 'senses' list, so
            # appending entry2's senses here would extend the list with
            # itself and duplicate every sense.
            entry1.update(entry2)
        elif entry1['hw'] != entry2['hw']:
            raise Exception("cannot unify entries with different headwords: " +
                            "{0} vs. {1}".format(entry1['hw'], entry2['hw']))
        else:
            entry1['senses'] += entry2['senses']

    def process_entries(self, words):
        """Preprocess and parse the raw entries of *words*.

        The resulting entries are stored in ``self.dictionary`` keyed by
        headword; entries flagged ``to_filter`` are dropped and entries with
        an already known headword are merged into the existing one.

        Args:
            words: iterable of headwords present in ``self.raw_dict``.
        """
        entry_preprocessor = EntryPreprocessor(self.cfg)
        entries = map(entry_preprocessor.preprocess_entry,
                      (self.raw_dict[word] for word in words))

        if self.lang == 'eng':
            stanford_wrapper = StanfordWrapper(self.cfg)
            entries = stanford_wrapper.parse_sentences(entries,
                                                       definitions=True)
        elif self.lang == 'hun':
            magyarlanc_wrapper = Magyarlanc(self.cfg)
            entries = magyarlanc_wrapper.parse_entries(entries)
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('incorrect lang')

        for entry in entries:
            if entry['to_filter']:
                continue
            word = entry['hw']
            # This loop changes nothing; it only fails with KeyError when a
            # sense has no 'definition' key.
            for sense in entry['senses']:
                definition = sense['definition']
                if definition is None:
                    continue

            if word in self.dictionary:
                logging.warning(
                    "entries with identical headwords:\n{0}\n{1}".format(
                        entry, self.dictionary[word]))

                self.unify(self.dictionary[word], entry)
            else:
                self.dictionary[word] = entry

    def process_entries_thread(self, i, words):
        """Run ``process_entries`` and record success in ``thread_states``.

        Args:
            i: index of the batch, used as key in ``self.thread_states``.
            words: headwords of the batch.
        """
        try:
            self.process_entries(words)
        except:  # noqa: E722
            # Catch everything: run() waits until every batch has recorded
            # a state, so a state must be stored whatever went wrong.
            self.thread_states[i] = False
            traceback.print_exc()
        else:
            self.thread_states[i] = True

    def run(self, no_threads=1):
        """Parse the input and process all entries, one batch per thread.

        Args:
            no_threads: requested number of threads; fewer may be started
                when the input is small.

        Raises:
            Exception: if any batch failed.
        """
        logging.info('parsing xml...')
        self.parse_dict()
        # Floor division keeps the batch size an integer on Python 3 too.
        entries_per_thread = (len(self.raw_dict) // no_threads) + 1
        self.thread_states = {}
        # may turn out to be less than "no_threads" with small input
        started_threads = 0
        if ONE_BY_ONE:
            logging.warning('running threads one by one!')
        for i, batch in enumerate(
                batches(self.raw_dict.keys(), entries_per_thread)):

            if ONE_BY_ONE:
                logging.warning('running batch #{0}'.format(i))
                self.process_entries_thread(i, batch)
            else:
                t = threading.Thread(target=self.process_entries_thread,
                                     args=(i, batch))
                t.start()
            started_threads += 1
        logging.info("started {0} threads".format(started_threads))
        # Poll until every started batch has recorded its state.
        while True:
            if len(self.thread_states) < started_threads:
                time.sleep(1)
                continue
            elif all(self.thread_states.values()):
                # Report the threads actually started, which can be fewer
                # than the number requested.
                logging.info(
                    "{0} threads finished successfully".format(
                        started_threads))
                break
            else:
                raise Exception("some threads failed")

    def read_dict(self):
        """Load ``self.dictionary`` from the JSON file at ``output_fn``."""
        logging.info(
            'loading dict_to_4lang intermediate state from {0}'.format(
                self.output_fn))
        with open(self.output_fn, 'r') as dict_file:
            self.dictionary = json.load(dict_file)
        logging.info('done!')

    def print_dict(self, stream=None):
        """Dump ``self.dictionary`` as JSON.

        Args:
            stream: writable file-like object; when None the dictionary is
                written to the file at ``self.output_fn`` instead.
        """
        if stream is None:
            with open(self.output_fn, 'w') as out:
                json.dump(self.dictionary, out)
        else:
            json.dump(self.dictionary, stream)
Ejemplo n.º 4
0
class DictTo4lang():
    """Convert a parsed lexicon file into a headword-indexed dictionary.

    Entries are produced by an input-type-specific parser, merged per
    headword, preprocessed, passed through a language-specific wrapper
    (``StanfordWrapper`` for 'eng', ``Magyarlanc`` for 'hun') and collected
    in ``self.dictionary``, which can be written to and read from JSON.
    """

    def __init__(self, cfg):
        """Read paths from *cfg* and select the parser for the input type.

        Args:
            cfg: config object queried as ``cfg.get(section, option)``.
        """
        self.dictionary = {}
        self.cfg = cfg
        self.output_fn = self.cfg.get('dict', 'output_file')
        # NOTE(review): ensure_dir is defined elsewhere; presumably it
        # creates the directory when it does not exist yet.
        ensure_dir(os.path.dirname(self.output_fn))
        self.tmp_dir = self.cfg.get('data', 'tmp_dir')
        ensure_dir(self.tmp_dir)
        self.graph_dir = self.cfg.get('machine', 'graph_dir')
        ensure_dir(self.graph_dir)
        self.get_parser_and_lang()
        self.machine_wrapper = None

    def get_parser_and_lang(self):
        """Set ``self.parser`` and ``self.lang`` from the configured input.

        Raises:
            Exception: if the ``input_type`` option of the ``dict`` section
                is not one of the known dictionary formats.
        """
        input_type = self.cfg.get('dict', 'input_type')
        logging.info('input type: {0}'.format(input_type))
        if input_type == 'wiktionary':
            self.parser = WiktParser()
            self.lang = 'eng'
        elif input_type == 'longman':
            self.parser = LongmanParser()
            self.lang = 'eng'
        elif input_type == 'collins':
            self.parser = CollinsParser()
            self.lang = 'eng'
        elif input_type == 'eksz':
            self.parser = EkszParser()
            self.lang = 'hun'
        elif input_type == 'nszt':
            self.parser = NSzTParser()
            self.lang = 'hun'
        else:
            raise Exception('unknown input format: {0}'.format(input_type))

    def parse_dict(self):
        """Parse the configured input file into ``self.raw_dict``.

        Entries sharing a headword are merged; entries without any senses
        are skipped.
        """
        input_file = self.cfg.get('dict', 'input_file')
        self.raw_dict = defaultdict(dict)
        for entry in self.parser.parse_file(input_file):
            if 'senses' not in entry or entry['senses'] == []:
                continue  # todo
            self.unify(self.raw_dict[entry['hw']], entry)

    def unify(self, entry1, entry2):
        """Merge *entry2* into *entry1* in place.

        An empty *entry1* takes over the contents of *entry2*; otherwise
        the senses of *entry2* are appended to those of *entry1*.

        Raises:
            Exception: if *entry1* is non-empty and the headwords differ.
        """
        if entry1 == {}:
            # update() leaves both entries sharing one 'senses' list, which
            # is why the senses are only appended in the final branch.
            entry1.update(entry2)
        elif entry1['hw'] != entry2['hw']:
            raise Exception(
                "cannot unify entries with different headwords: " +
                "{0} vs. {1}".format(entry1['hw'], entry2['hw']))
        else:
            entry1['senses'] += entry2['senses']

    def process_entries(self, words):
        """Preprocess and parse the raw entries of *words*.

        The resulting entries are stored in ``self.dictionary`` keyed by
        headword; entries flagged ``to_filter`` are dropped and entries with
        an already known headword are merged into the existing one.

        Args:
            words: iterable of headwords present in ``self.raw_dict``.
        """
        entry_preprocessor = EntryPreprocessor(self.cfg)
        entries = map(entry_preprocessor.preprocess_entry,
                      (self.raw_dict[word] for word in words))

        if self.lang == 'eng':
            stanford_wrapper = StanfordWrapper(self.cfg)
            entries = stanford_wrapper.parse_sentences(
                entries, definitions=True)
        elif self.lang == 'hun':
            magyarlanc_wrapper = Magyarlanc(self.cfg)
            entries = magyarlanc_wrapper.parse_entries(entries)
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('incorrect lang')

        for entry in entries:
            if entry['to_filter']:
                continue
            word = entry['hw']
            # This loop changes nothing; it only fails with KeyError when a
            # sense has no 'definition' key.
            for sense in entry['senses']:
                definition = sense['definition']
                if definition is None:
                    continue

            if word in self.dictionary:
                logging.warning(
                    "entries with identical headwords:\n{0}\n{1}".format(
                        entry, self.dictionary[word]))

                self.unify(self.dictionary[word], entry)
            else:
                self.dictionary[word] = entry

    def process_entries_thread(self, i, words):
        """Run ``process_entries`` and record success in ``thread_states``.

        Args:
            i: index of the batch, used as key in ``self.thread_states``.
            words: headwords of the batch.
        """
        try:
            self.process_entries(words)
        except:  # noqa: E722
            # Catch everything: run() waits until every batch has recorded
            # a state, so a state must be stored whatever went wrong.
            self.thread_states[i] = False
            traceback.print_exc()
        else:
            self.thread_states[i] = True

    def run(self, no_threads=1):
        """Parse the input and process all entries, one batch per thread.

        Args:
            no_threads: requested number of threads; fewer may be started
                when the input is small.

        Raises:
            Exception: if any batch failed.
        """
        logging.info('parsing xml...')
        self.parse_dict()
        # Floor division keeps the batch size an integer on Python 3 too.
        entries_per_thread = (len(self.raw_dict) // no_threads) + 1
        self.thread_states = {}
        # may turn out to be less than "no_threads" with small input
        started_threads = 0
        if ONE_BY_ONE:
            logging.warning('running threads one by one!')
        for i, batch in enumerate(batches(self.raw_dict.keys(),
                                  entries_per_thread)):

            if ONE_BY_ONE:
                logging.warning('running batch #{0}'.format(i))
                self.process_entries_thread(i, batch)
            else:
                t = threading.Thread(
                    target=self.process_entries_thread, args=(i, batch))
                t.start()
            started_threads += 1
        logging.info("started {0} threads".format(started_threads))
        # Poll until every started batch has recorded its state.
        while True:
            if len(self.thread_states) < started_threads:
                time.sleep(1)
                continue
            elif all(self.thread_states.values()):
                # Report the threads actually started, which can be fewer
                # than the number requested.
                logging.info(
                    "{0} threads finished successfully".format(
                        started_threads))
                break
            else:
                raise Exception("some threads failed")

    def read_dict(self):
        """Load ``self.dictionary`` from the JSON file at ``output_fn``."""
        logging.info(
            'loading dict_to_4lang intermediate state from {0}'.format(
                self.output_fn))
        with open(self.output_fn, 'r') as dict_file:
            self.dictionary = json.load(dict_file)
        logging.info('done!')

    def print_dict(self, stream=None):
        """Dump ``self.dictionary`` as JSON.

        Args:
            stream: writable file-like object; when None the dictionary is
                written to the file at ``self.output_fn`` instead.
        """
        if stream is None:
            with open(self.output_fn, 'w') as out:
                json.dump(self.dictionary, out)
        else:
            json.dump(self.dictionary, stream)