Esempio n. 1
0
    def process_entries(self, words):
        """Preprocess and parse the dictionary entries for *words*, then
        merge each resulting entry into self.dictionary keyed by headword,
        unifying entries whose headwords collide.
        """
        entry_preprocessor = EntryPreprocessor(self.cfg)
        # Python 2 map(): eagerly builds the list of preprocessed entries.
        entries = map(entry_preprocessor.preprocess_entry,
                      (self.raw_dict[word] for word in words))

        # Language-specific dependency parsing of the entries.
        if self.lang == 'eng':
            stanford_wrapper = StanfordWrapper(self.cfg)
            entries = stanford_wrapper.parse_sentences(entries,
                                                       definitions=True)
        elif self.lang == 'hun':
            magyarlanc_wrapper = Magyarlanc(self.cfg)
            entries = magyarlanc_wrapper.parse_entries(entries)
        else:
            print 'incorrect lang'

        for entry in entries:
            # Entries flagged by preprocessing/parsing are dropped entirely.
            if entry['to_filter']:
                continue
            word = entry['hw']
            # NOTE(review): this inner loop has no observable effect --
            # 'continue' only advances the sense loop and 'definition' is
            # never used afterwards. Possibly an unfinished sense filter;
            # confirm the original intent before changing it.
            for sense in entry['senses']:
                definition = sense['definition']
                if definition is None:
                    continue

            if word in self.dictionary:
                # Duplicate headword: log both entries, then merge.
                logging.warning(
                    "entries with identical headwords:\n{0}\n{1}".format(
                        entry, self.dictionary[word]))

                self.unify(self.dictionary[word], entry)
            else:
                self.dictionary[word] = entry
Esempio n. 2
0
    def process_entries(self, words):
        """Preprocess and parse the dictionary entries for *words*, then
        merge them into self.dictionary keyed by headword, unifying
        entries whose headwords collide.
        """
        entry_preprocessor = EntryPreprocessor(self.cfg)
        # Python 2 map(): eagerly builds the list of preprocessed entries.
        entries = map(entry_preprocessor.preprocess_entry,
                      (self.raw_dict[word] for word in words))

        # Language-specific dependency parsing of the entries.
        if self.lang == 'eng':
            stanford_wrapper = StanfordWrapper(self.cfg)
            entries = stanford_wrapper.parse_sentences(
                entries, definitions=True)
        elif self.lang == 'hun':
            magyarlanc_wrapper = Magyarlanc(self.cfg)
            entries = magyarlanc_wrapper.parse_entries(entries)
        else:
            print 'incorrect lang'

        for entry in entries:
            # Entries flagged by preprocessing/parsing are dropped entirely.
            if entry['to_filter']:
                continue
            word = entry['hw']
            # NOTE(review): this inner loop has no observable effect --
            # 'continue' only advances the sense loop and 'definition' is
            # never used afterwards. Possibly an unfinished sense filter;
            # confirm the original intent before changing it.
            for sense in entry['senses']:
                definition = sense['definition']
                if definition is None:
                    continue

            if word in self.dictionary:
                # Duplicate headword: log both entries, then merge.
                logging.warning(
                    "entries with identical headwords:\n{0}\n{1}".format(
                        entry, self.dictionary[word]))

                self.unify(self.dictionary[word], entry)
            else:
                self.dictionary[word] = entry
Esempio n. 3
0
 def __init__(self, cfg):
     """Read paths and language from *cfg*, create the output
     directories and build the parser wrapper and the dep-to-4lang
     converter.
     """
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     self.deps_dir = self.cfg.get('text', 'deps_dir')
     # self.machines_dir = self.cfg.get('text', 'machines_dir')
     self.graphs_dir = cfg.get('text', 'graph_dir')
     # Explicit loop instead of map(): map() is lazy under Python 3, so
     # the side-effecting ensure_dir calls would silently never run.
     for dir_path in (self.deps_dir, self.graphs_dir):  # self.machines_dir
         ensure_dir(dir_path)
     if self.lang == 'en':
         self.parser_wrapper = CoreNLPWrapper(self.cfg)
     elif self.lang == 'hu':
         self.parser_wrapper = Magyarlanc(self.cfg)
     # NOTE(review): any other language leaves parser_wrapper unset,
     # which raises AttributeError on first use downstream.
     self.dep_to_4lang = DepTo4lang(self.cfg)
Esempio n. 4
0
 def __init__(self, cfg):
     """Configure directories, the language-specific parser wrapper and
     the dep-to-4lang converter from *cfg*.
     """
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     self.deps_dir = self.cfg.get('text', 'deps_dir')
     # self.machines_dir = self.cfg.get('text', 'machines_dir')
     self.graphs_dir = cfg.get('text', 'graph_dir')
     # Side effects belong in a plain loop; map() would be lazy on
     # Python 3 and the directories would never be created.
     for dir_path in (self.deps_dir, self.graphs_dir):  # self.machines_dir
         ensure_dir(dir_path)
     if self.lang == 'en':
         self.parser_wrapper = CoreNLPWrapper(self.cfg)
     elif self.lang == 'hu':
         self.parser_wrapper = Magyarlanc(self.cfg)
     # NOTE(review): no fallback for other languages -- parser_wrapper
     # stays unset and later access raises AttributeError.
     self.dep_to_4lang = DepTo4lang(self.cfg)
Esempio n. 5
0
 def __init__(self, cfg, direct_parse=False):
     """Set up directories and parsing backends from *cfg*.

     When direct_parse is true, the deps directory is neither read from
     the config nor created.
     """
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     if not direct_parse:
         self.deps_dir = self.cfg.get('text', 'deps_dir')
         ensure_dir(self.deps_dir)
     # self.machines_dir = self.cfg.get('text', 'machines_dir')
     self.graphs_dir = cfg.get('text', 'graph_dir')
     ensure_dir(self.graphs_dir)
     # Dispatch table for the supported parser backends; unsupported
     # languages simply leave parser_wrapper unset.
     wrapper_classes = {'en': CoreNLPWrapper, 'hu': Magyarlanc}
     if self.lang in wrapper_classes:
         self.parser_wrapper = wrapper_classes[self.lang](self.cfg)
     self.dep_to_4lang = DepTo4lang(self.cfg, direct_parse)
Esempio n. 6
0
 def __init__(self, cfg, direct_parse=False):
     """Initialize directories, the parser wrapper and the dep-to-4lang
     converter from *cfg*; skip the deps directory when direct_parse is
     requested.
     """
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     if not direct_parse:
         self.deps_dir = self.cfg.get('text', 'deps_dir')
         ensure_dir(self.deps_dir)
     # self.machines_dir = self.cfg.get('text', 'machines_dir')
     self.graphs_dir = cfg.get('text', 'graph_dir')
     ensure_dir(self.graphs_dir)
     # Map language code to its parser class; None means unsupported
     # and parser_wrapper stays unset (as in the original behavior).
     wrapper_cls = {'en': CoreNLPWrapper, 'hu': Magyarlanc}.get(self.lang)
     if wrapper_cls is not None:
         self.parser_wrapper = wrapper_cls(self.cfg)
     self.dep_to_4lang = DepTo4lang(self.cfg, direct_parse)
Esempio n. 7
0
class TextTo4lang():
    """Pipeline that turns raw text into 4lang graphs ("machines"):
    preprocess sentences, dependency-parse them, then convert the
    dependencies to machines.
    """

    # Matches bracketed spans such as "[note]" so they can be stripped.
    # Raw string fixes the invalid "\[" escape of the original pattern.
    square_regex = re.compile(r"\[.*?\]")

    def __init__(self, cfg):
        """Read paths and language from *cfg*, create the output
        directories and build the parser wrapper and the dep-to-4lang
        converter.
        """
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        self.deps_dir = self.cfg.get('text', 'deps_dir')
        # self.machines_dir = self.cfg.get('text', 'machines_dir')
        self.graphs_dir = cfg.get('text', 'graph_dir')
        # Explicit loop instead of map(): map() is lazy under Python 3,
        # so the side-effecting ensure_dir calls would never run.
        for dir_path in (self.deps_dir, self.graphs_dir):  # self.machines_dir
            ensure_dir(dir_path)
        if self.lang == 'en':
            self.parser_wrapper = CoreNLPWrapper(self.cfg)
        elif self.lang == 'hu':
            self.parser_wrapper = Magyarlanc(self.cfg)
        # NOTE(review): other languages leave parser_wrapper unset.
        self.dep_to_4lang = DepTo4lang(self.cfg)

    @staticmethod
    def preprocess_text(text):
        """Return *text* stripped, with [bracketed] spans removed, '='
        protected as '_eq_' and non-breaking spaces normalized.
        """
        t = text.strip()
        t = TextTo4lang.square_regex.sub('', t)
        t = t.replace(u"=", u"_eq_")
        t = t.replace(u"\xc2\xa0", u" ")  # UTF-8 nbsp mis-decoded as latin-1
        t = t.replace(u"\xa0", u" ")  # non-breaking space
        t = t.strip()
        # if t != text:
        #   logging.debug(u"{0} -> {1}".format(text, t))
        return t

    def print_deps(self, parsed_sens, dep_dir=None, fn=None):
        """Write each sentence's dependency list to its own .dep file in
        *dep_dir*, named by index (optionally prefixed with *fn*).

        NOTE(review): dep_dir defaults to None yet is always joined --
        os.path.join raises unless callers supply it; confirm callers.
        """
        for i, deps in enumerate(parsed_sens):
            if fn is None:
                out_fn = os.path.join(dep_dir, "{0}.dep".format(i))
            else:
                out_fn = os.path.join(dep_dir, "{0}_{1}.dep".format(fn, i))
            with open(out_fn, 'w') as f:
                f.write(
                    "\n".join(["{0}({1}, {2})".format(*dep) for dep in deps]))

    def process(self):
        """Process the configured input: a single file, or every file in
        the configured directory.
        """
        input_path = self.cfg.get('text', 'input_sens')
        if os.path.isdir(input_path):
            file_names = [
                os.path.join(input_path, fn) for fn in os.listdir(input_path)]
        else:
            file_names = [input_path]
        logging.info('will process {0} file(s)'.format(len(file_names)))
        # Explicit loop instead of map(): map() is lazy under Python 3
        # and the side-effecting process_file calls would be skipped.
        for file_name in file_names:
            self.process_file(file_name)

    def process_file(self, fn):
        """Parse *fn* into a cached .deps file (skipping the parse if the
        cache exists), then optionally convert the deps to machines.
        """
        base_fn = os.path.basename(fn)
        deps_fn = os.path.join(self.deps_dir, "{0}.deps".format(base_fn))
        # machines_fn = os.path.join(
        #     self.machines_dir, "{0}.machines".format(base_fn))
        if not os.path.exists(deps_fn):
            self.parse_file(fn, deps_fn)
        else:
            logging.info('file exists: {0}, not parsing'.format(deps_fn))

        # TODO also support dumping machines to file
        # logging.getLogger().setLevel(__MACHINE_LOGLEVEL__)

        if not self.cfg.getboolean('text', 'parse_only'):
            self.process_deps(deps_fn)

    def parse_file(self, fn, out_fn):
        """Dependency-parse the sentences in *fn* and dump the result
        (deps and coreference chains) as one JSON line into *out_fn*.
        """
        logging.info("parsing file: {0}".format(fn))
        preproc_sens = []
        # 'with' guarantees the input handle is closed (the original
        # leaked it on non-refcounting interpreters).
        with open(fn) as in_f:
            for line in in_f:
                if not line:
                    continue
                preproc_sens.append(TextTo4lang.preprocess_text(
                    line.strip().decode('utf-8')))  # Py2: bytes -> unicode
        deps, corefs = self.parser_wrapper.parse_text("\n".join(preproc_sens))
        with open(out_fn, 'w') as out_f:
            out_f.write("{0}\n".format(json.dumps({
                "deps": deps,
                "corefs": corefs})))
        logging.info("parsed {0} sentences".format(len(deps)))

    def process_deps(self, fn):
        """Convert every parsed sentence in the JSON-lines file *fn* to
        machines and return the list of per-sentence machine sets.
        """
        sen_machines = []
        c = 0
        with open(fn) as in_f:
            for line in in_f:
                data = json.loads(line)
                deps, corefs = data['deps'], data['corefs']
                for sen_deps in deps:
                    # logging.info("processing sentences...")
                    machines = \
                        self.dep_to_4lang.get_machines_from_deps_and_corefs(
                            [sen_deps], corefs)
                    if self.cfg.getboolean('text', 'expand'):
                        self.dep_to_4lang.lexicon.expand(machines)

                    if self.cfg.getboolean('text', 'print_graphs'):
                        # The original rebound the parameter 'fn' here,
                        # shadowing the input file name; the return value
                        # was never used, so just discard it.
                        print_text_graph(machines, self.graphs_dir, fn=c)

                    sen_machines.append(machines)
                    c += 1

        return sen_machines

    @staticmethod
    def delete_connection(m1, m2):
        """Remove *m2* from the first partition of *m1* containing it and
        return that partition's index; return None if *m2* is not found.
        """
        for part in range(len(m1.partitions)):
            if m2 in m1.partitions[part]:
                m1.remove(m2, part)
                return part
        # ipdb.set_trace()
        return None
Esempio n. 8
0
class TextTo4lang():
    """Pipeline that turns raw text into 4lang graphs ("machines"):
    preprocess, dependency-parse (also keeping parse trees), then
    convert the dependencies to machines.
    """

    # Matches bracketed spans such as "[note]" so they can be stripped.
    # Raw string fixes the invalid "\[" escape of the original pattern.
    square_regex = re.compile(r"\[.*?\]")

    def __init__(self, cfg, direct_parse=False):
        """Set up directories and backends from *cfg*; when direct_parse
        is true, the deps cache directory is not prepared.
        """
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        if not direct_parse:
            self.deps_dir = self.cfg.get('text', 'deps_dir')
            ensure_dir(self.deps_dir)
        # self.machines_dir = self.cfg.get('text', 'machines_dir')
        self.graphs_dir = cfg.get('text', 'graph_dir')
        ensure_dir(self.graphs_dir)
        if self.lang == 'en':
            self.parser_wrapper = CoreNLPWrapper(self.cfg)
        elif self.lang == 'hu':
            self.parser_wrapper = Magyarlanc(self.cfg)
        # NOTE(review): other languages leave parser_wrapper unset.
        self.dep_to_4lang = DepTo4lang(self.cfg, direct_parse)

    @staticmethod
    def preprocess_text(text):
        """Return *text* stripped, with [bracketed] spans removed, '='
        protected as '_eq_' and non-breaking spaces normalized.
        """
        t = text.strip()
        t = TextTo4lang.square_regex.sub('', t)
        t = t.replace(u"=", u"_eq_")
        t = t.replace(u"\xc2\xa0", u" ")  # UTF-8 nbsp mis-decoded as latin-1
        t = t.replace(u"\xa0", u" ")  # non-breaking space
        t = t.strip()
        # if t != text:
        #   logging.debug(u"{0} -> {1}".format(text, t))
        return t

    def print_deps(self, parsed_sens, dep_dir=None, fn=None):
        """Write each sentence's dependency list to its own .dep file in
        *dep_dir*, named by index (optionally prefixed with *fn*).

        NOTE(review): dep_dir defaults to None yet is always joined --
        os.path.join raises unless callers supply it; confirm callers.
        """
        for i, deps in enumerate(parsed_sens):
            if fn is None:
                out_fn = os.path.join(dep_dir, "{0}.dep".format(i))
            else:
                out_fn = os.path.join(dep_dir, "{0}_{1}.dep".format(fn, i))
            with open(out_fn, 'w') as f:
                f.write("\n".join(
                    ["{0}({1}, {2})".format(*dep) for dep in deps]))

    def process(self):
        """Process the configured input: a single file, or every file in
        the configured directory.
        """
        input_path = self.cfg.get('text', 'input_sens')
        if os.path.isdir(input_path):
            file_names = [
                os.path.join(input_path, fn) for fn in os.listdir(input_path)
            ]
        else:
            file_names = [input_path]
        logging.info('will process {0} file(s)'.format(len(file_names)))
        # Explicit loop instead of map(): map() is lazy under Python 3
        # and the side-effecting process_file calls would be skipped.
        for file_name in file_names:
            self.process_file(file_name)

    def process_file(self, fn):
        """Parse *fn* into a cached .deps file (skipping the parse if the
        cache exists), then optionally convert the deps to machines.
        """
        base_fn = os.path.basename(fn)
        deps_fn = os.path.join(self.deps_dir, "{0}.deps".format(base_fn))
        # machines_fn = os.path.join(
        #     self.machines_dir, "{0}.machines".format(base_fn))
        if not os.path.exists(deps_fn):
            self.parse_file(fn, deps_fn)
        else:
            logging.info('file exists: {0}, not parsing'.format(deps_fn))

        # TODO also support dumping machines to file
        # logging.getLogger().setLevel(__MACHINE_LOGLEVEL__)

        if not self.cfg.getboolean('text', 'parse_only'):
            self.process_deps(deps_fn)

    def parse_file(self, fn, out_fn):
        """Dependency-parse the sentences in *fn*; write the parse trees
        to a sibling file and dump deps + corefs as one JSON line into
        *out_fn*.
        """
        logging.info("parsing file: {0}".format(fn))
        preproc_sens = []
        # 'with' guarantees the input handle is closed (the original
        # leaked it on non-refcounting interpreters).
        with open(fn) as in_f:
            for line in in_f:
                if not line:
                    continue
                preproc_sens.append(
                    TextTo4lang.preprocess_text(line.strip().decode('utf-8')))
        deps, corefs, parse_trees = self.parser_wrapper.parse_text(
            "\n".join(preproc_sens))
        # NOTE(review): split('.')[0] truncates at the FIRST dot anywhere
        # in the path (including directory names); os.path.splitext would
        # be safer but changes existing output names -- confirm first.
        parse_tree_fn = out_fn.split('.')[0] + '_parse_trees.txt'
        with open(parse_tree_fn, 'w') as out_f:
            for sen, parse_tree in zip(preproc_sens, parse_trees):
                out_f.write(u"{0}\t{1}\n".format(sen,
                                                 parse_tree).encode('utf-8'))
        with open(out_fn, 'w') as out_f:
            out_f.write("{0}\n".format(
                json.dumps({
                    "deps": deps,
                    "corefs": corefs
                })))
        logging.info("parsed {0} sentences".format(len(deps)))

    def process_deps(self, fn):
        """Convert every parsed sentence in the JSON-lines file *fn* to
        machines and return the list of per-sentence machine sets.
        """
        sen_machines = []
        c = 0
        with open(fn) as in_f:
            for line in in_f:
                data = json.loads(line)
                deps, corefs = data['deps'], data['corefs']
                for sen_deps in deps:
                    # logging.info("processing sentences...")
                    machines = \
                        self.dep_to_4lang.get_machines_from_deps_and_corefs(
                            [sen_deps], corefs)
                    if self.cfg.getboolean('text', 'expand'):
                        self.dep_to_4lang.lexicon.expand(machines)

                    if self.cfg.getboolean('text', 'print_graphs'):
                        # The original rebound the parameter 'fn' here,
                        # shadowing the input file name; the return value
                        # was never used, so just discard it.
                        print_text_graph(machines, self.graphs_dir, fn=c)

                    sen_machines.append(machines)
                    c += 1

        return sen_machines

    @staticmethod
    def delete_connection(m1, m2):
        """Remove *m2* from the first partition of *m1* containing it and
        return that partition's index; return None if *m2* is not found.
        """
        for part in range(len(m1.partitions)):
            if m2 in m1.partitions[part]:
                m1.remove(m2, part)
                return part
        # ipdb.set_trace()
        return None

    def process_phrase(self, phrase):
        """Parse a single *phrase* and return its machines; optionally
        expand the lexicon and print the graph (named after the phrase,
        with spaces -> underscores and dots removed).
        """
        preproc_sens = []
        preproc_sens.append(
            TextTo4lang.preprocess_text(phrase.strip().decode('utf-8')))
        deps, corefs, _ = self.parser_wrapper.parse_text(
            "\n".join(preproc_sens))
        machine = self.dep_to_4lang.get_machines_from_deps_and_corefs(
            [deps[0]], corefs)
        if self.cfg.getboolean('text', 'expand'):
            self.dep_to_4lang.lexicon.expand(machine)

        file_name = phrase.replace(' ', '_')
        file_name = file_name.replace('.', '')

        if self.cfg.getboolean('text', 'print_graphs'):
            # Return value of print_text_graph is unused; local name
            # avoids any shadowing.
            fn = print_text_graph(machine, self.graphs_dir, fn=file_name)
        return machine