def __init__(self, cfg):
    self.cfg = cfg
    self.out_fn = self.cfg.get("machine", "ext_definitions")
    ensure_dir(os.path.dirname(self.out_fn))
    dep_map_fn = cfg.get("deps", "dep_map")
    self.read_dep_map(dep_map_fn)
    self.lemmatizer = Lemmatizer(cfg)
def prepare_articles(articles, from_cache=False):
    texts = []
    lemmatizer = Lemmatizer()
    german_stop_words = stopwords.words('german')
    filename = "data/lda-trainingdata.pickle"
    if from_cache:
        with open(filename, 'rb') as file:
            texts = pickle.load(file)
        return texts
    else:
        # Remove '... [+ xxx chars]' pattern from 'content'
        for article in progressbar(articles):
            article_text = ""
            for text in [article.description, article.title,
                         article.fulltext if article.fulltext else article.content]:
                if text:
                    text = re.sub(r'\[.*?\]', '', text)
                    text = " ".join([x for x in text.split() if x.isalnum() or '.' in x])
                    article_text += lemmatizer.lemmatize_text(text=text, verbose=False)
            article_text = [x for x in article_text.split() if x not in german_stop_words]
            texts.append(article_text)
        # Cache lda-trainingdata
        if not os.path.exists("data"):
            os.makedirs("data")
        with open(filename, 'wb') as file:
            pickle.dump(texts, file)
        return texts
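# A minimal usage sketch for prepare_articles above.  It assumes an iterable
# of article objects exposing the description/title/fulltext/content
# attributes the function reads; load_articles() is a hypothetical helper.
articles = load_articles()

# First run: clean, lemmatize, drop German stop words and cache the token
# lists to data/lda-trainingdata.pickle.
texts = prepare_articles(articles, from_cache=False)

# Later runs can skip preprocessing and read the cached token lists instead.
cached_texts = prepare_articles(articles, from_cache=True)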
def __init__(self, cfg):
    try:
        self.batch = cfg.getboolean('similarity_machine', 'batch')
    except NoSectionError:
        self.batch = False
    self.cfg = cfg
    self.lemmatizer = Lemmatizer(cfg)
    self.machine_wrapper = MachineWrapper(cfg)
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words('english'))
def __init__(self, cfg):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    self.out_fn = self.cfg.get("machine", "definitions_binary_out")
    ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.read_dep_map(dep_map_fn)
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.word2lemma = {}
def add_string(self, string, encoded_from=None, train=None):
    """Add string to mapping and return id and optionally character id.

    Arguments:
        string: string.
        encoded_from: string, only used if is_encoded == True or
            is_encoded == None (undecided).
        train: Train mapping. If given, the words and alphabets are reused
            from the train mapping.
    Returns:
        If characters are allowed, a tuple (string id, character id).
        Otherwise only string id.
    """
    # Store strings when is_encoded == None
    if self.is_encoded is None:
        self.strings_original.add(string)

    # Encode string with lemma rule
    if self.is_encoded is None or self.is_encoded is True:
        # Do not encode special labels
        if string not in ["<pad>", "<unk>", "<none>", "<root>", "<anchor>"]:
            encoded_string = Lemmatizer.gen_absolute_lemma_rule(encoded_from, string)
            if encoded_string in self.strings_map:
                string = encoded_string
            else:
                string = Lemmatizer.gen_lemma_rule(encoded_from, string)

    # Word-level information
    if string not in self.strings_map:
        if train:
            string = '<unk>'
        else:
            self.strings_map[string] = len(self.strings)
            self.strings.append(string)

    if self._include_characters:
        # Character-level information
        if string not in self.charseqs_map:
            self.charseqs_map[string] = len(self.charseqs)
            self.charseqs.append([])
            for c in string:
                if c not in self.alphabet_map:
                    if train:
                        c = '<unk>'
                    else:
                        self.alphabet_map[c] = len(self.alphabet)
                        self.alphabet.append(c)
                self.charseqs[-1].append(self.alphabet_map[c])

    return ((self.strings_map[string], self.charseqs_map[string])
            if self._include_characters else self.strings_map[string])
def __init__(self, cfg, direct_parse=False):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    if not direct_parse:
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.read_dep_map(dep_map_fn)
    self.word2lemma = {}
    self.first_only = cfg.getboolean('filter', 'first_only')
def parse_batch(article: WikiArticle, lemmatizer: Lemmatizer = self.l) -> List:  # noqa
    data = []
    for idx, (lemmas, tokens, sentence) in enumerate(
            lemmatizer.lemmatize(article.text)):
        data.append("{}\t{}\t{}\t{}\n".format(
            article.title_id, idx, sentence, " ".join(lemmas)))
    return data
def preprocessing(self, text, lang):
    '''Tokenize the text into words and sentences.'''
    self.stop_words = stopwords.words(lang) + list(punctuation)
    if lang == 'indonesian':
        self.lmm = Lemmatizer()
    elif lang == 'english':
        self.lmm = WordNetLemmatizer()
    self.tokenized_sent = list(set(sent_tokenize(text)))
def __init__(self, cfg, cfg_section='word_sim'):
    self.batch = cfg.getboolean(cfg_section, 'batch')
    logging.warning("fourlangpath is {0}".format(
        cfg.get(cfg_section, 'fourlangpath')))
    self.cfg = cfg
    self.graph_dir = cfg.get(cfg_section, "graph_dir")
    ensure_dir(self.graph_dir)
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.defined_words = self.lexicon.get_words()
    self.word_sim_cache = {}
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words('english'))
    self.sim_feats = SimFeatures(cfg, cfg_section)
    self.expand = cfg.getboolean(cfg_section, "expand")
    logging.info("expand is {0}".format(self.expand))
def from_absolute_encodings(mapping, is_encoded):
    assert mapping.is_encoded is None or mapping.is_encoded is True
    assert is_encoded is True
    new = Mapping(include_characters=mapping._include_characters,
                  is_encoded=True, train=None)
    for key in mapping.strings_map:
        if key not in ["<pad>", "<unk>", "<none>", "<root>", "<anchor>"]:
            if Lemmatizer.is_absolute_lemma_rule(key):
                new.strings_map[key] = len(new.strings)
                new.strings.append(key)
    return new
def parse_sentence(self, s):
    keywords = []
    # Lemmatize sentence and only keep verbs, nouns, dates and PTs
    l = Lemmatizer()
    lemmas = l.lemmatize(s)
    lemmas = l.filter(lemmas, ['V', 'N', 'W', 'PT'])
    # Normalize lemmas
    for l in lemmas:
        if l['tag'] == 'W':
            norm_lemma = l['lemma']
        else:
            norm_lemma = self.normalize(l['lemma'])
        if len(norm_lemma) > 0 and norm_lemma not in ignore_lemmas:
            keywords.append(norm_lemma)
    self.vprint("Keywords: ", keywords)
    return [self.crawler.getwordid(word) for word in keywords]
def id_to_string(self, string_id, encoded_from=None, train=None):
    """Returns decoded string from int id."""
    string = self.strings[string_id]
    # Special strings are not encoded
    if string in ["<pad>", "<unk>", "<none>", "<root>", "<anchor>"]:
        return string
    else:
        # Decode string with lemma rule
        if self.is_encoded is None or self.is_encoded is True:
            return Lemmatizer.apply_lemma_rule(encoded_from, string)
        else:
            return string
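# A small round-trip sketch for the Mapping snippets above (add_string,
# id_to_string, from_absolute_encodings).  The constructor arguments follow
# the call shown in from_absolute_encodings; the form/lemma pair is
# illustrative, and it is assumed that Lemmatizer.apply_lemma_rule inverts
# the rule produced by gen_lemma_rule.
mapping = Mapping(include_characters=False, is_encoded=True, train=None)

form, lemma = "Häuser", "Haus"  # illustrative inflected form and its lemma
string_id = mapping.add_string(lemma, encoded_from=form)

# Decoding applies the stored lemma rule to the form; this should print the
# original lemma, "Haus".
print(mapping.id_to_string(string_id, encoded_from=form))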
def __init__(self, cfg, direct_parse=False):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    if not direct_parse:
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.read_dep_map(dep_map_fn)
    self.word2lemma = {}
    self.first_n = cfg.getint('filter', 'first_n')
    self.graph_dir = self.cfg.get('machine', 'graph_dir')
    ensure_dir(self.graph_dir)
def __init__(self, cfg, cfg_section="word_sim"):
    try:
        self.batch = cfg.getboolean(cfg_section, "batch")
    except NoSectionError:
        self.batch = False
    self.cfg = cfg
    self.graph_dir = cfg.get(cfg_section, "graph_dir")
    ensure_dir(self.graph_dir)
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.defined_words = self.lexicon.get_words()
    self.word_sim_cache = {}
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words("english"))
    self.expand = cfg.getboolean(cfg_section, "expand")
    logging.info("expand is {0}".format(self.expand))
def process(filename):
    global lemmatizer
    lemmatizer = Lemmatizer()
    raw_tweets = Tokenizer.parse(filename)
    raw_tweets, hashtags = Tokenizer.extract_hashtags(raw_tweets)
    raw_tweets, mentions = Tokenizer.extract_mentions(raw_tweets)
    raw_tweets, emojis = Tokenizer.extract_emojis(raw_tweets)
    tweets = []
    for text, hashtag, mention, emoji in zip(raw_tweets, hashtags, mentions, emojis):
        tweets.append(Tweet(Tokenizer.tokenize(text), hashtag, mention, emoji))
    return tweets
def __init__(self, cfg, cfg_section='word_sim'):
    self.batch = cfg.getboolean(cfg_section, 'batch')
    logging.warning("fourlangpath is {0}".format(
        cfg.get(cfg_section, 'fourlangpath')))
    self.cfg = cfg
    self.graph_dir = cfg.get(cfg_section, "graph_dir")
    ensure_dir(self.graph_dir)
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.defined_words = self.lexicon.get_words()
    self.word_sim_cache = {}
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words('english'))
    self.sim_feats = SimFeatures(cfg, cfg_section, self.lexicon)
    self.expand = cfg.getboolean(cfg_section, "expand")
    compositional = cfg.getboolean('similarity', 'compositional')
    if compositional is True:
        self.text_to_4lang = TextTo4lang(cfg, direct_parse=True)
    logging.info("expand is {0}".format(self.expand))
    self.allow_4lang = cfg.getboolean('machine', 'allow_4lang')
def test_usage():
    with timing('Loading dictionary entries'):
        dict = load_dict('data/dict/polimorf-20190818.tab', limit=5000)
    with timing('Loading word vectors'):
        word_vectors = KeyedVectors.load_word2vec_format(
            'data/nkjp+wiki-forms-all-300-skipg-ns.txt', limit=5000)
    with timing('Initializing POS tagger'):
        posTagger = Lemmatizer.create(dict, word_vectors)
        posTagger.load_model('data/disambiguation.h5')
    text = '5 kilogramów pomidorów trafiło do kuchnii. Zostały ugotowane na miękko.'
    chunks = tokenize(text)
    assert chunks[0].tokens[1].orth == 'kilogramów'
    print(chunks)
    posTagger.tag(chunks)
    assert chunks[0].tokens[1].disamb_lemma == 'kilogram'
    assert chunks[0].tokens[1].disamb_tag.startswith('noun')
    print(chunks)
class DepTo4lang():

    dep_regex = re.compile("([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
        self.dependency_processor = DependencyProcessor(self.cfg)
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.undefined = set()
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.word2lemma = {}

    def read_dep_map(self, dep_map_fn):
        self.dependencies = defaultdict(list)
        for line in file(dep_map_fn):
            l = line.strip()
            if not l or l.startswith('#'):
                continue
            dep = Dependency.create_from_line(l)
            self.dependencies[dep.name].append(dep)

    def apply_dep(self, dep, machine1, machine2):
        dep_type = dep['type']
        msd1 = dep['gov'].get('msd')
        msd2 = dep['dep'].get('msd')
        if dep_type not in self.dependencies:
            if dep_type not in self.undefined:
                self.undefined.add(dep_type)
                logging.warning(
                    'skipping dependency not in dep_to_4lang map: {0}'.format(
                        dep_type))
            return False  # not that anyone cares
        for dep in self.dependencies[dep_type]:
            dep.apply(msd1, msd2, machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    # TODO these are words that only have pointers to an MWE
                    # that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    # TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue
                # logging.info('adding: {0}'.format(word))
                # logging.info('ext_lex_keys: {0}'.format(
                #     self.lexicon.ext_lexicon.keys()))
                self.lexicon.add(word, machine)
            except Exception:
                logging.error(u"exception caused by: '{0}'".format(word))
                # logging.error(
                #     u'skipping "{0}" because of an exception:'.format(
                #         word))
                # logging.info("entry: {0}".format(entry))
                traceback.print_exc()
                sys.exit(-1)
                continue
        logging.info('added {0}, done!'.format(c + 1))

    def print_graphs(self):
        print_4lang_graphs(
            self.lexicon.ext_lexicon,
            self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        self.lexicon.save_to_binary(self.out_fn)

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_root_lemmas(self, deps):
        return [
            d['dep'].setdefault(
                'lemma', self.lemmatizer.lemmatize(d['dep']['word']))
            for d in deps if d['type'] == 'root']  # TODO

    def get_dep_definition(self, word, deps):
        deps = self.dependency_processor.process_dependencies(deps)
        root_lemmas = self.get_root_lemmas(deps)
        if not root_lemmas:
            logging.warning(
                u'no root dependency, skipping word "{0}"'.format(word))
            return None

        word2machine = self.get_machines_from_deps_and_corefs(
            [deps], [], process_deps=False)

        root_machines = filter(None, map(word2machine.get, root_lemmas))
        if not root_machines:
            logging.info("failed to find root machine")
            logging.info('root lemmas: {0}'.format(root_lemmas))
            logging.info('word2machine: {0}'.format(word2machine))
            sys.exit(-1)

        word_machine = self.lexicon.get_new_machine(word)

        for root_machine in root_machines:
            word_machine.unify(root_machine)
            word_machine.append(root_machine, 0)

        return word_machine

    def get_machines_from_deps_and_corefs(
            self, dep_lists, corefs, process_deps=True):
        if process_deps:
            dep_lists = map(
                self.dependency_processor.process_dependencies, dep_lists)
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no - 1] = word

        # logging.info('coref index: {0}'.format(coref_index))

        word2machine = {}
        for deps in dep_lists:
            for dep in deps:
                for t in (dep['gov'], dep['dep']):
                    self.word2lemma[t['word']] = t.setdefault(
                        'lemma', self.lemmatizer.lemmatize(t['word']))

        for i, deps in enumerate(dep_lists):
            try:
                for dep in deps:
                    word1 = dep['gov']['word']
                    word2 = dep['dep']['word']
                    # logging.info('dep: {0}, w1: {1}, w2: {2}'.format(
                    #     repr(dep), repr(word1), repr(word2)))
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)
                    """
                    if c_word1 != word1:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word1, c_word1))
                    if c_word2 != word2:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word2, c_word2))
                    """
                    lemma1 = self.word2lemma[c_word1]
                    lemma2 = self.word2lemma[c_word2]
                    # TODO
                    # lemma1 = lemma1.replace('/', '_PER_')
                    # lemma2 = lemma2.replace('/', '_PER_')
                    # logging.info(
                    #     'lemma1: {0}, lemma2: {1}'.format(
                    #         repr(lemma1), repr(lemma2)))
                    for lemma in (lemma1, lemma2):
                        if lemma not in word2machine:
                            word2machine[lemma] = self.lexicon.get_new_machine(
                                lemma)

                    self.apply_dep(
                        dep, word2machine[lemma1], word2machine[lemma2])
            except:
                logging.error(u"failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")

        return word2machine
class WordSimilarity():
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')
        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section, self.lexicon)
        self.expand = cfg.getboolean(cfg_section, "expand")
        compositional = cfg.getboolean('similarity', 'compositional')
        if compositional is True:
            self.text_to_4lang = TextTo4lang(cfg, direct_parse=True)
        logging.info("expand is {0}".format(self.expand))
        self.allow_4lang = cfg.getboolean('machine', 'allow_4lang')

    def log(self, string):
        if not self.batch:
            logging.info(string)

    def sim_type_to_function(self, sim_type):
        return lambda w1, w2: self.word_similarities(w1, w2)[sim_type]

    def machine_similarities(self, machine1, machine2, machine1_expand,
                             machine2_expand):
        pn1, pn2 = machine1.printname(), machine2.printname()
        self.log(u'machine1: {0}, machine2: {1}'.format(pn1, pn2))

        links1, nodes1 = self.get_links_nodes(machine1)
        links2, nodes2 = self.get_links_nodes(machine2)
        links1_expand, nodes1_expand = self.get_links_nodes(machine1_expand)
        links2_expand, nodes2_expand = self.get_links_nodes(machine2_expand)

        self.log('links1: {0}, links2: {1}'.format(links1, links2))
        self.log('nodes1: {0}, nodes2: {1}'.format(nodes1, nodes2))
        self.log('links1_expand: {0}, links2_expand: {1}'.format(
            links1_expand, links2_expand))
        self.log('nodes1_expand: {0}, nodes2_expand: {1}'.format(
            nodes1_expand, nodes2_expand))

        sims = self.sim_feats.get_all_features(
            MachineInfo(machine1, nodes1, nodes1_expand, links1,
                        links1_expand),
            MachineInfo(machine2, nodes2, nodes2_expand, links2,
                        links2_expand))
        # TODO: we should use this way, but so far it didn't prove to be better
        # if sims['is_antonym'] == 1:
        #     sims['shortest_path'] = 0
        return sims

    def lemma_similarities(self, lemma1, lemma2):
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]
        if lemma1 == lemma2:
            lemma_sims = self.sim_feats.one_similarities()

        machine1, machine2 = map(
            lambda l: self.lexicon.get_machine(
                l, allow_4lang=self.allow_4lang),
            (lemma1, lemma2))
        machine1_expand, machine2_expand = map(
            self.lexicon.get_expanded_definition, (lemma1, lemma2))

        if not self.batch:
            for w, m in ((lemma1, machine1), (lemma2, machine2)):
                print_4lang_graph(w, m, self.graph_dir)
            for w, m in ((lemma1, machine1_expand),
                         (lemma2, machine2_expand)):
                print_4lang_graph(w, m, self.graph_dir + "_expand")

        lemma_sims = self.machine_similarities(
            machine1, machine2, machine1_expand, machine2_expand)
        self.lemma_sim_cache[(lemma1, lemma2)] = lemma_sims
        self.lemma_sim_cache[(lemma2, lemma1)] = lemma_sims
        return lemma_sims

    def word_similarities(self, word1, word2):
        if (word1, word2) in self.word_sim_cache:
            return self.word_sim_cache[(word1, word2)]
        lemma1, lemma2 = [self.lemmatizer.lemmatize(
            word, defined=self.defined_words, stem_first=True,
            uppercase=True) for word in (word1, word2)]
        # self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
        if lemma1 is None or lemma2 is None:
            if lemma1 is None:
                logging.debug("OOV: {0}".format(word1))
            if lemma2 is None:
                logging.debug("OOV: {0}".format(word2))
            word_sims = self.sim_feats.zero_similarities()
        else:
            word_sims = self.lemma_similarities(lemma1, lemma2)
        self.word_sim_cache[(word1, word2)] = word_sims
        self.word_sim_cache[(word2, word1)] = word_sims
        return word_sims

    def phrase_similarities(self, phrase1, phrase2):
        words1 = phrase1.split(' ')
        words2 = phrase2.split(' ')
        if (len(words1) == 1 and len(words2) == 1):
            return self.word_similarities(phrase1, phrase2)
        else:
            # TODO: cache!
            machine1 = self.text_to_4lang.process_phrase(phrase1)
            machine2 = self.text_to_4lang.process_phrase(phrase2)
            nodes1 = self.get_nodes_from_text_machine(machine1)
            nodes2 = self.get_nodes_from_text_machine(machine2)
            sims = self.sim_feats.get_all_features(
                MachineInfo(machine1, nodes1, nodes1, None, None,
                            has_printname=False),
                MachineInfo(machine2, nodes2, nodes2, None, None,
                            has_printname=False))
            print "{0}\t{1}\t{2}".format(phrase1, phrase2, sims)
            return sims

    def get_nodes_from_text_machine(self, machine, excludes=["ROOT"]):
        return set(
            itertools.chain(*[self._get_all_nodes(k)
                              for k in machine.values()
                              if k.printname() not in set(excludes)]))
        # return [k for k in set(machine.keys()).difference(set(excludes))]

    def _get_all_nodes(self, machine):
        nodes = [m for m in MachineTraverser.get_nodes(
            machine, names_only=True, keep_upper=False)]
        return nodes

    def get_links_nodes(self, machine, use_cache=True):
        if use_cache and machine in self.links_nodes_cache:
            return self.links_nodes_cache[machine]
        self.seen_for_links = set()
        links, nodes = self._get_links_and_nodes(machine, depth=0)
        links, nodes = set(links), set(nodes)
        links.add(machine.printname())
        nodes.add(machine.printname())
        self.links_nodes_cache[machine] = (links, nodes)
        return links, nodes

    def _get_links_and_nodes(self, machine, depth, exclude_links=False):
        name = machine.printname()
        if name.isupper() or name == '=AGT':
            links, nodes = [], []
        elif exclude_links:
            links, nodes = [], [name]
        else:
            links, nodes = [name], [name]
        # logging.info("{0}{1},{2}".format(depth*" ", links, nodes))
        is_negated = False
        is_before = False
        if machine in self.seen_for_links or depth > 5:
            return [], []
        self.seen_for_links.add(machine)
        for i, part in enumerate(machine.partitions):
            for hypernym in part:
                h_name = hypernym.printname()
                # logging.info("{0}h: {1}".format(depth*" ", h_name))
                if h_name in ("lack", "not", "before"):
                    is_negated = True
                    continue
                c_links, c_nodes = self._get_links_and_nodes(
                    hypernym, depth=depth + 1, exclude_links=i != 0)
                if not h_name.isupper():
                    links += c_links
                    nodes += c_nodes
        if not exclude_links:
            links += self.get_binary_links(machine)
        if is_negated:
            add_lack = lambda link: "lack_{0}".format(link) if isinstance(link, unicode) else ("lack_{0}".format(link[0]), link[1])  # nopep8
            links = map(add_lack, links)
            nodes = map(add_lack, nodes)

        return links, nodes

    def get_binary_links(self, machine):
        for parent, partition in machine.parents:
            parent_pn = parent.printname()
            # if not parent_pn.isupper() or partition == 0:
            if partition == 0:
                # haven't seen it yet but possible
                continue
            elif partition == 1:
                links = set([(parent_pn, other.printname())
                             for other in parent.partitions[2]])
            elif partition == 2:
                links = set([(other.printname(), parent_pn)
                             for other in parent.partitions[1]])
            else:
                raise Exception(
                    'machine {0} has more than 3 partitions!'.format(machine))
            for link in links:
                yield link

    def contains(self, links, machine):
        pn = machine.printname()
        for link in links:
            if link == pn or (pn in link and isinstance(link, tuple)):
                self.log('link "{0}" is/contains name "{1}"'.format(link, pn))
                return True
        else:
            return False
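# A hypothetical driver for the WordSimilarity class above (Python 2, like
# the snippet).  The config path is a placeholder; cfg must provide the
# sections and options read in __init__ ('word_sim', 'similarity', 'machine').
from ConfigParser import ConfigParser

cfg = ConfigParser()
cfg.read('conf/word_sim.cfg')  # assumed config file

ws = WordSimilarity(cfg, cfg_section='word_sim')
sims = ws.word_similarities('dog', 'hound')  # dict of similarity features
print(sims)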
class DepTo4lang():

    dep_regex = re.compile("([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.out_fn = self.cfg.get("machine", "ext_definitions")
        ensure_dir(os.path.dirname(self.out_fn))
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.lemmatizer = Lemmatizer(cfg)

    def read_dep_map(self, dep_map_fn):
        self.dependencies = {}
        for line in file(dep_map_fn):
            l = line.strip()
            if not l or l.startswith('#'):
                continue
            dep = Dependency.create_from_line(l)
            self.dependencies[dep.name] = dep

    def apply_dep(self, dep_str, machine1, machine2):
        if dep_str not in self.dependencies:
            logging.warning(
                'skipping dependency not in dep_to_4lang map: {0}'.format(
                    dep_str))
            return False  # not that anyone cares
        self.dependencies[dep_str].apply(machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        self.words_to_machines = {}
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    # TODO these are words that only have pointers to an MWE
                    # that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    # TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue
                self.words_to_machines[word] = machine
            except Exception:
                logging.error(
                    u'skipping "{0}" because of an exception:'.format(word))
                logging.info("entry: {0}".format(entry))
                traceback.print_exc()
                continue
        logging.info('done!')

    def print_graphs(self):
        print_4lang_graphs(
            self.words_to_machines,
            self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        logging.info('saving machines to {0}...'.format(self.out_fn))
        with open(self.out_fn, 'w') as out_file:
            cPickle.dump(self.words_to_machines, out_file)
        logging.info('done!')

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_dep_definition(self, word, deps):
        root_deps = filter(lambda d: d[0] == 'root', deps)
        if len(root_deps) != 1:
            logging.warning(
                u'no unique root dependency, skipping word "{0}"'.format(
                    word))
            return None
        root_word, root_id = root_deps[0][2]
        root_lemma = self.lemmatizer.lemmatize(root_word).replace(
            '/', '_PER_')
        root_lemma = root_word if not root_lemma else root_lemma

        word2machine = self.get_machines_from_parsed_deps(deps)

        root_machine = word2machine[root_lemma]
        word_machine = word2machine.get(word, Machine(word, ConceptControl()))
        word_machine.append(root_machine, 0)
        return word_machine

    def get_machines_from_deps(self, dep_strings):
        # deprecated, use get_machines_from_deps_and_corefs
        deps = map(DepTo4lang.parse_dependency, dep_strings)
        return self.get_machines_from_parsed_deps(deps)

    def get_machines_from_parsed_deps(self, deps):
        # deprecated, use get_machines_from_deps_and_corefs
        return self.get_machines_from_deps_and_corefs([deps], [])

    def get_machines_from_deps_and_corefs(self, dep_lists, corefs):
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no - 1] = word

        # logging.info('coref index: {0}'.format(coref_index))

        lexicon = Lexicon()
        word2machine = {}

        for i, deps in enumerate(dep_lists):
            try:
                for dep, (word1, id1), (word2, id2) in deps:
                    # logging.info('w1: {0}, w2: {1}'.format(word1, word2))
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)
                    """
                    if c_word1 != word1:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word1, c_word1))
                    if c_word2 != word2:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word2, c_word2))
                    """
                    # logging.info(
                    #     'cw1: {0}, cw2: {1}'.format(c_word1, c_word2))
                    lemma1 = self.lemmatizer.lemmatize(c_word1)
                    lemma2 = self.lemmatizer.lemmatize(c_word2)
                    lemma1 = c_word1 if not lemma1 else lemma1
                    lemma2 = c_word2 if not lemma2 else lemma2
                    # TODO
                    lemma1 = lemma1.replace('/', '_PER_')
                    lemma2 = lemma2.replace('/', '_PER_')
                    # logging.info(
                    #     'lemma1: {0}, lemma2: {1}'.format(lemma1, lemma2))
                    machine1, machine2 = self._add_dependency(
                        dep, (lemma1, id1), (lemma2, id2), lexicon)
                    word2machine[lemma1] = machine1
                    word2machine[lemma2] = machine2
            except:
                logging.error("failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")
        return word2machine

    def _add_dependency(self, dep, (word1, id1), (word2, id2), lexicon):
        """Given a triplet from Stanford Dep.: D(w1,w2), we create and
        activate machines for w1 and w2, then run all operators associated
        with D on the sequence of the new machines (m1, m2)"""
        # logging.info(
        #     'adding dependency {0}({1}, {2})'.format(dep, word1, word2))
        machine1, machine2 = map(lexicon.get_machine, (word1, word2))
        self.apply_dep(dep, machine1, machine2)
        return machine1, machine2
class WordSimilarity():
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')
        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section)
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))

    def log(self, string):
        if not self.batch:
            logging.info(string)

    def sim_type_to_function(self, sim_type):
        return lambda w1, w2: self.word_similarities(w1, w2)[sim_type]

    def machine_similarities(self, machine1, machine2, machine1_expand,
                             machine2_expand):
        pn1, pn2 = machine1.printname(), machine2.printname()
        self.log(u'machine1: {0}, machine2: {1}'.format(pn1, pn2))

        links1, nodes1 = self.get_links_nodes(machine1)
        links2, nodes2 = self.get_links_nodes(machine2)
        links1_expand, nodes1_expand = self.get_links_nodes(machine1_expand)
        links2_expand, nodes2_expand = self.get_links_nodes(machine2_expand)

        self.log('links1: {0}, links2: {1}'.format(links1, links2))
        self.log('nodes1: {0}, nodes2: {1}'.format(nodes1, nodes2))
        self.log('links1_expand: {0}, links2_expand: {1}'.format(
            links1_expand, links2_expand))
        self.log('nodes1_expand: {0}, nodes2_expand: {1}'.format(
            nodes1_expand, nodes2_expand))

        sims = self.sim_feats.get_all_features(
            MachineInfo(machine1_expand, nodes1, nodes1_expand, links1,
                        links1_expand),
            MachineInfo(machine2_expand, nodes2, nodes2_expand, links2,
                        links2_expand))
        return sims

    def lemma_similarities(self, lemma1, lemma2):
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]
        if lemma1 == lemma2:
            lemma_sims = self.sim_feats.one_similarities()

        machine1, machine2 = map(
            self.lexicon.get_machine, (lemma1, lemma2))
        machine1_expand, machine2_expand = map(
            self.lexicon.get_expanded_definition, (lemma1, lemma2))

        if not self.batch:
            for w, m in ((lemma1, machine1), (lemma2, machine2)):
                print_4lang_graph(w, m, self.graph_dir)
            for w, m in ((lemma1, machine1_expand),
                         (lemma2, machine2_expand)):
                print_4lang_graph(w, m, self.graph_dir + "_expand")

        lemma_sims = self.machine_similarities(
            machine1, machine2, machine1_expand, machine2_expand)
        self.lemma_sim_cache[(lemma1, lemma2)] = lemma_sims
        self.lemma_sim_cache[(lemma2, lemma1)] = lemma_sims
        return lemma_sims

    def word_similarities(self, word1, word2):
        if (word1, word2) in self.word_sim_cache:
            return self.word_sim_cache[(word1, word2)]
        lemma1, lemma2 = [self.lemmatizer.lemmatize(
            word, defined=self.defined_words, stem_first=True)
            for word in (word1, word2)]
        # self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
        if lemma1 is None or lemma2 is None:
            if lemma1 is None:
                logging.debug("OOV: {0}".format(word1))
            if lemma2 is None:
                logging.debug("OOV: {0}".format(word2))
            word_sims = self.sim_feats.zero_similarities()
        else:
            word_sims = self.lemma_similarities(lemma1, lemma2)
        self.word_sim_cache[(word1, word2)] = word_sims
        self.word_sim_cache[(word2, word1)] = word_sims
        return word_sims

    def get_links_nodes(self, machine, use_cache=True):
        if use_cache and machine in self.links_nodes_cache:
            return self.links_nodes_cache[machine]
        self.seen_for_links = set()
        links, nodes = self._get_links_and_nodes(machine, depth=0)
        links, nodes = set(links), set(nodes)
        links.add(machine.printname())
        nodes.add(machine.printname())
        self.links_nodes_cache[machine] = (links, nodes)
        return links, nodes

    def _get_links_and_nodes(self, machine, depth, exclude_links=False):
        name = machine.printname()
        if name.isupper() or name == '=AGT':
            links, nodes = [], []
        elif exclude_links:
            links, nodes = [], [name]
        else:
            links, nodes = [name], [name]
        # logging.info("{0}{1},{2}".format(depth*" ", links, nodes))
        is_negated = False
        is_before = False
        if machine in self.seen_for_links or depth > 5:
            return [], []
        self.seen_for_links.add(machine)
        for i, part in enumerate(machine.partitions):
            for hypernym in part:
                h_name = hypernym.printname()
                # logging.info("{0}h: {1}".format(depth*" ", h_name))
                if h_name in ("lack", "not", "before"):
                    is_negated = True
                    continue
                c_links, c_nodes = self._get_links_and_nodes(
                    hypernym, depth=depth + 1, exclude_links=i != 0)
                if not h_name.isupper():
                    links += c_links
                    nodes += c_nodes
        if not exclude_links:
            links += self.get_binary_links(machine)
        if is_negated:
            add_lack = lambda link: "lack_{0}".format(link) if isinstance(link, unicode) else ("lack_{0}".format(link[0]), link[1])  # nopep8
            links = map(add_lack, links)
            nodes = map(add_lack, nodes)

        return links, nodes

    def get_binary_links(self, machine):
        for parent, partition in machine.parents:
            parent_pn = parent.printname()
            # if not parent_pn.isupper() or partition == 0:
            if partition == 0:
                # haven't seen it yet but possible
                continue
            elif partition == 1:
                links = set([(parent_pn, other.printname())
                             for other in parent.partitions[2]])
            elif partition == 2:
                links = set([(other.printname(), parent_pn)
                             for other in parent.partitions[1]])
            else:
                raise Exception(
                    'machine {0} has more than 3 partitions!'.format(machine))
            for link in links:
                yield link

    def contains(self, links, machine):
        pn = machine.printname()
        for link in links:
            if link == pn or (pn in link and isinstance(link, tuple)):
                self.log('link "{0}" is/contains name "{1}"'.format(link, pn))
                return True
        else:
            return False
def __init__(self, wiki_file: str) -> None:
    self.wiki_file = wiki_file
    self.l = Lemmatizer()  # noqa
from n_grams import N_grams

projeto = 'jquery'
arquivos = listdir('pull requests ' + projeto)
raw = ''
for arquivo in arquivos:
    with open('pull requests ' + projeto + '/' + arquivo) as json_file:
        data = json.load(json_file)
        raw = raw + str(data['body'])

texto, codigo = separar_codigo.separar(raw)
tokens = Tokennizer.tokenize(texto)
# stemmed_list = Stemmer.stemmer(tokens)
stop_worded_list = Stop_words.stop_words(tokens)
lemmatized_list = Lemmatizer.lemmatizer(stop_worded_list)
bigrams = N_grams.n_grams(lemmatized_list, 2)
trigrams = N_grams.n_grams(lemmatized_list, 3)
f_bi = FreqDist(bigrams)
f_tri = FreqDist(trigrams)
frequentes = FreqDist(lemmatized_list)
porcentagem = int(len(frequentes) / 10)

f = open('vocabularios/' + projeto + '/tokens.txt', 'w', encoding="utf-8")
for item in frequentes.most_common(porcentagem):
    # print(str(item[0]) + '; ' + str(item[1]))
    f.write(str(item[0]) + '; ' + str(item[1]) + '\n')
f.close()
import sys
from lemmatizer import Lemmatizer

src = sys.argv[1]
tgt = sys.argv[2]

lemm_cz = Lemmatizer(src, "/home/big_maggie/usr/nmt_scripts/lgmf_%s.lex" % src,
                     "il2", path="/home/big_maggie/usr/nmt_scripts/liblemm.so")
lemm_en = Lemmatizer(tgt, "/home/big_maggie/usr/nmt_scripts/lgmf_%s.lex" % tgt,
                     "il2", path="/home/big_maggie/usr/nmt_scripts/liblemm.so")

# TODO: for each sentence, replace the entities created by the tokenizer
# with the original tokens again
for line in sys.stdin:
    # line = line.decode('utf-8')
    print('\t'.join((str(lemm_cz.get_lang(line, 0.5, src)),
                     str(lemm_en.get_lang(line, 0.5, tgt)))))
def main():
    # read data from the raw data file
    file_reader = FileReader('train.csv')
    # get text from raw data
    train = file_reader.get_text()
    # get labels and classes from raw data
    labels, cla = file_reader.get_labels()

    # Because all the basic functions are implemented by ourselves in this
    # project, preprocessing takes longer than with the nltk built-in
    # functions, so we only use 10k rows to test the algorithm here.
    train_list = list(train)[:10000]

    # store data after cleaning
    print('Clean the data, remove special punctuations, numbers and abbreviations....')
    clean_list = list()
    cleaner = DataClean()
    for train_data in train_list:
        clean_list.append(cleaner.clean(train_data))
    print('Data clean done!')
    print('')

    tkn = Tokenizer()

    # train a random forest POS tagger classification model
    print('Training a pos tagger classfication model....')
    pos_tagger, onehot_enc = train_pos_tag()
    print('Model training done!')
    print('')

    text_list = list()
    # split text into sentences before pos_tag
    print('Start tokenizing and lemmatizing....')
    print('This step will take a few minutes')
    for clean_data in clean_list:
        sents = tkn.sent_tokenize(clean_data)
        text_list.append(sents)

    # features for pos_tag
    features = [
        'word', 'is_first_word', 'is_last_word', 'prev_word',
        'prev_word_last_1', 'prev_word_last_2', 'next_word', 'is_numeric',
        'first_1', 'first_2', 'first_3', 'first_4', 'last_1', 'last_2',
        'last_3', 'last_4', 'is_numeric', 'word_has_hyphen'
    ]

    # init Lemmatizer
    lem = Lemmatizer()
    lem_texts = list()
    # tokenize, pos_tag and lemmatize sentence by sentence
    for sents in text_list:
        word_features = pd.DataFrame(get_data_label(sents, label=False))
        # some data is empty
        if not word_features.empty:
            word_encode = word_features[features].values
            word_encode = onehot_enc.transform(word_encode)
            pred_pos = pos_tagger.predict(word_encode)
            lem_text = list()
            text = word_features.word
            for index in range(len(text)):
                lem_text.append(
                    lem.lemmatize(text[index], tag_map(pred_pos[index])))
            lem_texts.append(lem_text)
        else:
            lem_texts.append([])
    print('Done!')
    print('')

    print('Start building the Vocabulary for our data....')
    voc = Vocabulary(lem_texts)
    voc.remove_stop_words()
    print('Done!')
    print('')

    print('Calculating idf....')
    print('It may take 3 minutes in this step')
    # get idf word dict from Vocabulary
    idf_reference = voc.idf()
    idf = np.zeros([len(voc)])
    for word in idf_reference:
        idf[voc.pos(word)] = idf_reference[word]
    print('idf done!')
    print('')

    # the tf-idf encode array
    data_array = np.zeros([len(lem_texts), len(voc)], dtype='int16')
    print('Calculating tf-idf....')
    for index, text in enumerate(lem_texts):
        vec = Vector(text, voc)
        data_array[index] = idf * vec.tf()
    print('Done!')
    print('')

    X, Y, test_X, test_Y = train_test_split(data_array, labels, test_size=0.5)

    # Split the train set into 5 folds for cross validation.  However, cross
    # validation is time consuming and not necessary in this project; we just
    # use one validation set to choose the best threshold.
    k = 5
    fold_list = k_fold(X, k=k)
    one_size = len(fold_list[0])
    train_X = np.zeros([one_size * 4, test_X.shape[1]])
    train_Y = np.zeros([one_size * 4, 6], dtype='int64')

    # split train dataset and validation dataset
    for index, fold in enumerate(fold_list):
        if index != k - 1:
            train_X[index * one_size:index * one_size + one_size] = X[fold]
            train_Y[index * one_size:index * one_size + one_size] = Y[fold]
        else:
            val_X = X[fold]
            val_Y = Y[fold]

    preds = np.zeros((len(val_X), len(cla)))
    Pred_test = np.zeros((len(test_X), len(cla)))

    # We use LogisticRegression to train 6 models, one for each category
    for index, cat in enumerate(cla):
        print('fit', cat)
        m, r = get_mdl(train_Y[:, index], train_X)
        preds[:, index] = m.predict_proba(val_X * r)[:, 1]
        Pred_test[:, index] = m.predict_proba(test_X * r)[:, 1]

    # searching for the best threshold
    threshold = [0.55, 0.6, 0.65, 0.7, 0.75]
    reslut_list = list()
    for t in threshold:
        sum_result = 0
        row, col = preds.shape
        pred_Y = np.zeros([row, col])
        for i in range(row):
            for j in range(col):
                if preds[i, j] >= t:
                    pred_Y[i, j] = 1
                else:
                    pred_Y[i, j] = 0
        # print out the prediction results
        print(f'Validation set Accuracy (threshold={t}):')
        for index, cat in enumerate(cla):
            result = (pred_Y[:, index] == val_Y[:, index]).sum() / len(pred_Y)
            sum_result += result
            print(f'{cat} : {result}')
        print('')
        reslut_list.append(sum_result)

    # Use the best threshold to predict on the test data set
    t = threshold[np.argmax(np.array(reslut_list))]
    print(f'The best threshold is {t}')
    row, col = Pred_test.shape
    pred_test_Y = np.zeros([row, col])
    for i in range(row):
        for j in range(col):
            if Pred_test[i, j] >= t:
                pred_test_Y[i, j] = 1
            else:
                pred_test_Y[i, j] = 0
    print('')
    print('#######################################')
    print('#######################################')
    print(f'Test set Accuracy (threshold={t}):')
    for index, cat in enumerate(cla):
        result = (pred_test_Y[:, index] == test_Y[:, index]).sum() / len(pred_test_Y)
        print(f'{cat} : {result}')
def __init__(self, dictionary):
    self.dictionary = dictionary
    self.lemmatizer = Lemmatizer(dictionary)
    self.rules = RULES
    self.tag_query_cache = {}  # runtime use for tag query in dictionary
class Parser:

    class Chart:

        class Vertex:
            def __init__(self, token=''):
                self.token = token

        class Edge:
            def __init__(self, lpos: int, rpos: int, unit: str, state: list):
                self.lpos = lpos
                self.rpos = rpos
                self.unit = unit
                self.state = state  # the right unscanned part

            def __lt__(self, other):
                if self.lpos != other.lpos:
                    return self.lpos < other.lpos
                elif self.rpos != other.rpos:
                    return self.rpos < other.rpos
                else:
                    return self.unit <= other.unit

            def __eq__(self, other):
                return (self.lpos == other.lpos and self.rpos == other.rpos
                        and self.unit == other.unit and self.state == other.state)

        def __init__(self):
            self.edges_active = []
            self.edges_inactive = []
            self.vertexes = []  # the rank is crucial

        def __str__(self):
            lines, vertex_flow = [], ''
            for i, v in enumerate(self.vertexes):
                vertex_flow += ' <%d> %s' % (i + 1, v.token)
            vertex_flow += ' <%d> ' % (len(self.vertexes) + 1)
            nlen = len(vertex_flow)
            lines.append('=' * nlen)
            lines.append(vertex_flow)
            lines.append('-' * nlen)
            rel = {}  # { (int, int): [str] }
            for e in self.edges_inactive:
                span = (e.lpos, e.rpos)
                if span in rel:
                    rel[span].append(e.unit)
                else:
                    rel[span] = [e.unit]
            for k in sorted(rel):
                lines.append('%r: %r' % (k, rel[k]))
            lines.append('>> found %d relations/edges.' % len(self.edges_inactive))
            lines.append('')
            return '\n'.join(lines)

        def add_vertex(self, label: str):
            self.vertexes.append(self.Vertex(label))
            return len(self.vertexes)

        def add_edge(self, lpos, rpos, tag, state: list = None):
            e = self.Edge(lpos, rpos, tag, state)
            if state is not None:
                if e not in self.edges_active:
                    self.edges_active.append(e)
            else:
                if e not in self.edges_inactive:
                    self.edges_inactive.append(e)

    INSTANE = None

    def __new__(cls, *args, **kwargs):
        if not cls.INSTANE:
            cls.INSTANE = super().__new__(cls)
        return cls.INSTANE

    def __init__(self, dictionary):
        self.dictionary = dictionary
        self.lemmatizer = Lemmatizer(dictionary)
        self.rules = RULES
        self.tag_query_cache = {}  # runtime use for tag query in dictionary

    def parse(self, sent):
        tokens = sent.split()
        # stack and its visit record to avoid duplicate pushes
        agenda, agenda_hist = [], set()
        chart = self.Chart()

        while agenda or tokens:
            if not agenda:
                tok, tokens = tokens[0], tokens[1:]
                tok = self.lemmatizer.lemmatize(tok)
                tags = self.tag_query_cache.get(tok)
                if not tags:
                    tags = {k for k, v in VOCABULARY.items() if tok in v}
                    if not tags:
                        dtags = (tok in self.dictionary
                                 and {tag for tag, _ in self.dictionary[tok]}
                                 or set())
                        tags = fuck_dtags_to_tags(dtags)
                    self.tag_query_cache[tok] = tags
                idx = chart.add_vertex(tok)
                for tag in tags:
                    todo = (tag, idx, idx + 1)
                    agenda.append(todo)
                    agenda_hist.add(todo)
            else:
                target, lpos, rpos = agenda.pop()
                for unit, unscanned in self.rules:
                    if unscanned and unscanned[0] == target:
                        if len(unscanned) > 1:
                            chart.add_edge(lpos, rpos, unit, unscanned[1:])
                        else:
                            todo = (unit, lpos, rpos)
                            if todo not in agenda_hist:
                                agenda.append(todo)
                                agenda_hist.add(todo)
                chart.add_edge(lpos, rpos, target)
                for e in chart.edges_active:
                    # rule alive: str, [] => unit, unscanned
                    unit, unscanned = e.unit, e.state
                    if unscanned and unscanned[0] == target:
                        if len(unscanned) > 1:
                            chart.add_edge(e.lpos, rpos, unit, unscanned[1:])
                        else:
                            todo = (unit, e.lpos, rpos)
                            if todo not in agenda_hist:
                                agenda.append(todo)
                                agenda_hist.add(todo)
            # print(agenda)
        return chart
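# A small usage sketch for the chart Parser above.  The dictionary entries
# and the sentence are made up for illustration, and the module-level RULES,
# VOCABULARY and fuck_dtags_to_tags names the class relies on are assumed to
# be defined alongside it.
dictionary = {
    'cat': [('noun', None)],
    'sleep': [('verb', None)],
}

parser = Parser(dictionary)      # singleton, via __new__ above
chart = parser.parse('the cat sleeps')
print(chart)                     # __str__ lists the vertices and found edges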
class WordSimilarity():
    def __init__(self, cfg):
        try:
            self.batch = cfg.getboolean('similarity_machine', 'batch')
        except NoSectionError:
            self.batch = False
        self.cfg = cfg
        self.lemmatizer = Lemmatizer(cfg)
        self.machine_wrapper = MachineWrapper(cfg)
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))

    def log(self, string):
        if not self.batch:
            logging.info(string)

    def get_links_nodes(self, machine, use_cache=True):
        if use_cache and machine in self.links_nodes_cache:
            return self.links_nodes_cache[machine]
        self.seen_for_links = set()
        links = set()
        nodes = set()
        for link, node in self._get_links_nodes(machine, depth=0):
            if link is not None:
                links.add(link)
            if node is not None:
                nodes.add(node)
        self.links_nodes_cache[machine] = (links, nodes)
        return links, nodes

    def _get_links_nodes(self, machine, depth):
        if machine in self.seen_for_links or depth > 5:
            return
        self.seen_for_links.add(machine)
        for hypernym in machine.partitions[0]:
            name = hypernym.printname()
            if name == '=AGT' or not name.isupper():
                # if depth == 0 and name not in ("lack", "to"):  # TMP!!!
                yield name, None

            for link, node in self._get_links_nodes(hypernym, depth=depth+1):
                yield link, node

        for link, node in self.get_binary_links_nodes(machine):
            yield link, node

        for node in MachineTraverser.get_nodes(machine):
            yield None, node

    def get_binary_links_nodes(self, machine):
        for parent, partition in machine.parents:
            parent_pn = parent.printname()
            # if not parent_pn.isupper() or partition == 0:
            if partition == 0:
                # haven't seen it yet but possible
                continue
            elif partition == 1:
                links = set([(parent_pn, other.printname())
                             for other in parent.partitions[2]])
                nodes = [m.printname() for m in parent.partitions[2]]
            elif partition == 2:
                links = set([(other.printname(), parent_pn)
                             for other in parent.partitions[1]])
                nodes = [m.printname() for m in parent.partitions[1]]
            else:
                raise Exception(
                    'machine {0} has more than 3 partitions!'.format(machine))
            for link in links:
                yield link, None
            for node in nodes:
                yield None, node

    def link_similarity(self, links1, links2):
        pass

    def contains(self, links, machine):
        pn = machine.printname()
        for link in links:
            if link == pn or (pn in link and isinstance(link, tuple)):
                self.log('link "{0}" is/contains name "{1}"'.format(link, pn))
                return True
        else:
            return False

    def machine_similarity(self, machine1, machine2, sim_type):
        pn1, pn2 = machine1.printname(), machine2.printname()
        self.log(u'machine1: {0}, machine2: {1}'.format(pn1, pn2))
        if sim_type == 'default':
            # sim = harmonic_mean((
            #     self._all_pairs_similarity(machine1, machine2),
            #     self._links_and_nodes_similarity(machine1, machine2)))
            sim = self._links_and_nodes_similarity(machine1, machine2)
            # exclude_nodes=True)  # TMP!!
        elif sim_type == 'all_pairs':
            sim = self._all_pairs_similarity(machine1, machine2)
        elif sim_type == 'links_and_nodes':
            sim = self._links_and_nodes_similarity(machine1, machine2)
        elif sim_type == 'strict_links_and_nodes':
            sim = self._links_and_nodes_similarity(
                machine1, machine2, no_contain_score=True)
        elif sim_type == 'links':
            sim = self._links_and_nodes_similarity(
                machine1, machine2, exclude_nodes=True)
        elif sim_type == 'strict_links':
            sim = self._links_and_nodes_similarity(
                machine1, machine2, exclude_nodes=True, no_contain_score=True)
        else:
            raise Exception("unknown similarity type: {0}".format(sim_type))
        return sim

    def _all_pairs_similarity(self, machine1, machine2):
        words1 = set(MachineTraverser.get_nodes(
            machine1, exclude_words=self.stopwords))
        words2 = set(MachineTraverser.get_nodes(
            machine2, exclude_words=self.stopwords))
        pair_sims_by_word = defaultdict(dict)
        for word1 in words1:
            for word2 in words2:
                sim = self.word_similarity(
                    word1, word2, -1, -1, sim_type="strict_links_and_nodes")
                pair_sims_by_word[word1][word2] = sim if sim else 0.0
                pair_sims_by_word[word2][word1] = sim if sim else 0.0
        max_sims_by_word = dict((
            (word, my_max(pair_sims_by_word[word].itervalues()))
            for word in words1 | words2))
        sim = average((average((max_sims_by_word[w] for w in words1)),
                       average((max_sims_by_word[w] for w in words2))))
        # sim = max((my_max((max_sims_by_word[w] for w in words1)),
        #            my_max((max_sims_by_word[w] for w in words2))))
        if sim:
            self.log(
                "{0} - {1} all_pairs similarity: {2} based on: {3}".format(
                    machine1.printname(), machine2.printname(), sim,
                    pair_sims_by_word))
        return sim

    def _links_and_nodes_similarity(self, machine1, machine2,
                                    exclude_nodes=False,
                                    no_contain_score=False):
        sim = 0
        links1, nodes1 = self.get_links_nodes(machine1)
        links2, nodes2 = self.get_links_nodes(machine2)
        if not no_contain_score:
            if (self.contains(links1, machine2) or
                    self.contains(links2, machine1)):
                sim = max(sim, 0.35)
            elif (not exclude_nodes) and (
                    self.contains(nodes1, machine2) or
                    self.contains(nodes2, machine1)):
                sim = max(sim, 0.25)
        self.log('links1: {0}, links2: {1}'.format(links1, links2))
        self.log('nodes1: {0}, nodes2: {1}'.format(nodes1, nodes2))
        if True:
            pn1, pn2 = machine1.printname(), machine2.printname()
            if pn1 in links2 or pn2 in links1:
                self.log(
                    "{0} and {1} connected by 0-path, returning 1".format(
                        pn1, pn2))
                return 1
            entities1 = filter(lambda l: "@" in l, links1)
            entities2 = filter(lambda l: "@" in l, links2)
            if entities1 or entities2:
                sim = max(sim, jaccard(entities1, entities2))
            else:
                sim = max(sim, jaccard(links1, links2))
            if not exclude_nodes:
                node_sim = jaccard(nodes1, nodes2)
                if node_sim > sim:
                    self.log(
                        'picking node sim ({0}) over link sim ({1})'.format(
                            node_sim, sim))
                    sim = node_sim
        return sim

    def word_similarity(self, word1, word2, pos1, pos2, sim_type='default',
                        fallback=lambda a, b, c, d: None):
        self.log(u'words: {0}, {1}'.format(word1, word2))
        lemma1, lemma2 = [self.lemmatizer.lemmatize(
            word, defined=self.machine_wrapper.definitions, stem_first=True)
            for word in (word1, word2)]
        self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
        if lemma1 is None or lemma2 is None:
            return fallback(word1, word2, pos1, pos2)
        sim = self.lemma_similarity(lemma1, lemma2, sim_type)
        self.log(u"S({0}, {1}) = {2}".format(word1, word2, sim))
        return sim

    def lemma_similarity(self, lemma1, lemma2, sim_type):
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]
        elif lemma1 == lemma2:
            return 1
        self.log(u'lemma1: {0}, lemma2: {1}'.format(lemma1, lemma2))

        machines1 = self.machine_wrapper.definitions[lemma1]
        machines2 = self.machine_wrapper.definitions[lemma2]

        pairs_by_sim = sorted([
            (self.machine_similarity(machine1, machine2, sim_type),
             (machine1, machine2))
            for machine1 in machines1 for machine2 in machines2],
            reverse=True)
        sim, (machine1, machine2) = pairs_by_sim[0]

        sim = sim if sim >= 0 else 0
        self.lemma_sim_cache[(lemma1, lemma2)] = sim
        self.lemma_sim_cache[(lemma2, lemma1)] = sim
        return sim
class WordSimilarity:
    sim_types = set([
        "links_jaccard", "nodes_jaccard", "links_contain", "nodes_contain",
        "0-connected", "entities_jaccard"])

    def __init__(self, cfg, cfg_section="word_sim"):
        try:
            self.batch = cfg.getboolean(cfg_section, "batch")
        except NoSectionError:
            self.batch = False
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words("english"))
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))

    def log(self, string):
        if not self.batch:
            logging.info(string)

    def uniform_similarities(self, s):
        # TODO return {sim_type: s for sim_type in WordSimilarity.sim_types}
        return dict((sim_type, s) for sim_type in WordSimilarity.sim_types)

    def zero_similarities(self):
        return self.uniform_similarities(0.0)

    def one_similarities(self):
        return self.uniform_similarities(1.0)

    def sim_type_to_function(self, sim_type):
        return lambda w1, w2: self.word_similarities(w1, w2)[sim_type]

    def machine_similarities(self, machine1, machine2):
        pn1, pn2 = machine1.printname(), machine2.printname()
        self.log(u"machine1: {0}, machine2: {1}".format(pn1, pn2))
        sims = self.zero_similarities()
        links1, nodes1 = self.get_links_nodes(machine1)
        links2, nodes2 = self.get_links_nodes(machine2)
        self.log("links1: {0}, links2: {1}".format(links1, links2))
        self.log("nodes1: {0}, nodes2: {1}".format(nodes1, nodes2))
        if self.contains(links1, machine2) or self.contains(links2, machine1):
            sims["links_contain"] = 1
        if self.contains(nodes1, machine2) or self.contains(nodes2, machine1):
            sims["nodes_contain"] = 1
        pn1, pn2 = machine1.printname(), machine2.printname()  # TODO
        if pn1 in links2 or pn2 in links1:
            sims["0-connected"] = 1
        entities1 = filter(lambda l: "@" in l, links1)
        entities2 = filter(lambda l: "@" in l, links2)
        sims["entities_jaccard"] = jaccard(entities1, entities2)
        sims["links_jaccard"] = jaccard(links1, links2)
        sims["nodes_jaccard"] = jaccard(nodes1, nodes2)
        return sims

    def lemma_similarities(self, lemma1, lemma2):
        if (lemma1, lemma2) in self.lemma_sim_cache:
            return self.lemma_sim_cache[(lemma1, lemma2)]
        if lemma1 == lemma2:
            lemma_sims = self.one_similarities()
        else:
            if self.expand:
                machine1, machine2 = map(
                    self.lexicon.get_expanded_definition, (lemma1, lemma2))
            else:
                machine1, machine2 = map(
                    self.lexicon.get_machine, (lemma1, lemma2))
            if not self.batch:
                for w, m in ((lemma1, machine1), (lemma2, machine2)):
                    print_4lang_graph(w, m, self.graph_dir)
            lemma_sims = self.machine_similarities(machine1, machine2)
        self.lemma_sim_cache[(lemma1, lemma2)] = lemma_sims
        self.lemma_sim_cache[(lemma2, lemma1)] = lemma_sims
        return lemma_sims

    def word_similarities(self, word1, word2):
        if (word1, word2) in self.word_sim_cache:
            return self.word_sim_cache[(word1, word2)]
        lemma1, lemma2 = [
            self.lemmatizer.lemmatize(
                word, defined=self.defined_words, stem_first=True)
            for word in (word1, word2)]
        # self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
        if lemma1 is None or lemma2 is None:
            if lemma1 is None:
                logging.debug("OOV: {0}".format(word1))
            if lemma2 is None:
                logging.debug("OOV: {0}".format(word2))
            # TODO
            word_sims = self.zero_similarities()
        else:
            word_sims = self.lemma_similarities(lemma1, lemma2)
        self.word_sim_cache[(word1, word2)] = word_sims
        self.word_sim_cache[(word2, word1)] = word_sims
        return word_sims

    def get_links_nodes(self, machine, use_cache=True):
        if use_cache and machine in self.links_nodes_cache:
            return self.links_nodes_cache[machine]
        self.seen_for_links = set()
        links, nodes = self._get_links_and_nodes(machine, depth=0)
        links, nodes = set(links), set(nodes)
        links.add(machine.printname())
        nodes.add(machine.printname())
        self.links_nodes_cache[machine] = (links, nodes)
        return links, nodes

    def _get_links_and_nodes(self, machine, depth, exclude_links=False):
        name = machine.printname()
        if name.isupper() or name == "=AGT":
            links, nodes = [], []
        elif exclude_links:
            links, nodes = [], [name]
        else:
            links, nodes = [name], [name]
        # logging.info("{0}{1},{2}".format(depth * " ", links, nodes))
        is_negated = False
        if machine in self.seen_for_links or depth > 5:
            return [], []
        self.seen_for_links.add(machine)
        for i, part in enumerate(machine.partitions):
            for hypernym in part:
                h_name = hypernym.printname()
                # logging.info("{0}h: {1}".format(depth * " ", h_name))
                if h_name in ("lack", "not"):
                    is_negated = True
                    continue
                c_links, c_nodes = self._get_links_and_nodes(
                    hypernym, depth=depth + 1, exclude_links=i != 0)
                if not h_name.isupper():
                    links += c_links
                    nodes += c_nodes
        if not exclude_links:
            links += self.get_binary_links(machine)
        if is_negated:
            add_lack = (
                lambda link: "lack_{0}".format(link)
                if isinstance(link, unicode)
                else ("lack_{0}".format(link[0]), link[1]))  # nopep8
            links = map(add_lack, links)
            nodes = map(add_lack, nodes)
        return links, nodes

    def get_binary_links(self, machine):
        for parent, partition in machine.parents:
            parent_pn = parent.printname()
            # if not parent_pn.isupper() or partition == 0:
            if partition == 0:
                # haven't seen it yet but possible
                continue
            elif partition == 1:
                links = set([
                    (parent_pn, other.printname())
                    for other in parent.partitions[2]])
            elif partition == 2:
                links = set([
                    (other.printname(), parent_pn)
                    for other in parent.partitions[1]])
            else:
                raise Exception(
                    "machine {0} has more than 3 partitions!".format(machine))
            for link in links:
                yield link

    def contains(self, links, machine):
        pn = machine.printname()
        for link in links:
            if link == pn or (pn in link and isinstance(link, tuple)):
                self.log('link "{0}" is/contains name "{1}"'.format(link, pn))
                return True
        else:
            return False
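# machine_similarities() above calls a jaccard() helper that is not defined in
# this snippet; a minimal sketch consistent with how it is used on link and
# node collections (the project's own implementation may differ):
def jaccard(seq1, seq2):
    """Jaccard index of two collections: |intersection| / |union|,
    returning 0.0 when both collections are empty."""
    s1, s2 = set(seq1), set(seq2)
    if not s1 and not s2:
        return 0.0
    return len(s1 & s2) / float(len(s1 | s2))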
class DepTo4lang():
    dep_regex = re.compile(
        "([a-z_-]*)\((.*?)-([0-9]*)'*, (.*?)-([0-9]*)'*\)")

    def __init__(self, cfg):
        self.cfg = cfg
        self.out_fn = self.cfg.get("machine", "ext_definitions")
        ensure_dir(os.path.dirname(self.out_fn))
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.lemmatizer = Lemmatizer(cfg)

    def read_dep_map(self, dep_map_fn):
        self.dependencies = {}
        for line in file(dep_map_fn):
            l = line.strip()
            if not l or l.startswith('#'):
                continue
            dep = Dependency.create_from_line(l)
            self.dependencies[dep.name] = dep

    def apply_dep(self, dep_str, machine1, machine2):
        if dep_str not in self.dependencies:
            logging.warning(
                'skipping dependency not in dep_to_4lang map: {0}'.format(
                    dep_str))
            return False  # not that anyone cares
        self.dependencies[dep_str].apply(machine1, machine2)

    def dep_to_4lang(self):
        dict_fn = self.cfg.get("dict", "output_file")
        logging.info('reading dependencies from {0}...'.format(dict_fn))
        longman = json.load(open(dict_fn))
        self.words_to_machines = {}
        for c, (word, entry) in enumerate(longman.iteritems()):
            if c % 1000 == 0:
                logging.info("added {0}...".format(c))
            try:
                if entry["to_filter"]:
                    continue
                if not entry['senses']:
                    # TODO these are words that only have pointers to an MWE
                    # that they are part of.
                    continue
                definition = entry['senses'][0]['definition']
                if definition is None:
                    continue
                deps = definition['deps']
                if not deps:
                    # TODO see previous comment
                    continue
                machine = self.get_dep_definition(word, deps)
                if machine is None:
                    continue
                self.words_to_machines[word] = machine
            except Exception:
                logging.error(
                    u'skipping "{0}" because of an exception:'.format(word))
                logging.info("entry: {0}".format(entry))
                traceback.print_exc()
                continue
        logging.info('done!')

    def print_graphs(self):
        print_4lang_graphs(
            self.words_to_machines, self.cfg.get('machine', 'graph_dir'))

    def save_machines(self):
        logging.info('saving machines to {0}...'.format(self.out_fn))
        with open(self.out_fn, 'w') as out_file:
            cPickle.dump(self.words_to_machines, out_file)
        logging.info('done!')

    @staticmethod
    def parse_dependency(string):
        dep_match = DepTo4lang.dep_regex.match(string)
        if not dep_match:
            raise Exception('cannot parse dependency: {0}'.format(string))
        dep, word1, id1, word2, id2 = dep_match.groups()
        return dep, (word1, id1), (word2, id2)

    def get_dep_definition(self, word, deps):
        root_deps = filter(lambda d: d[0] == 'root', deps)
        if len(root_deps) != 1:
            logging.warning(
                u'no unique root dependency, skipping word "{0}"'.format(
                    word))
            return None
        root_word, root_id = root_deps[0][2]
        root_lemma = self.lemmatizer.lemmatize(root_word).replace(
            '/', '_PER_')
        root_lemma = root_word if not root_lemma else root_lemma
        word2machine = self.get_machines_from_parsed_deps(deps)
        root_machine = word2machine[root_lemma]
        word_machine = word2machine.get(word, Machine(word, ConceptControl()))
        word_machine.append(root_machine, 0)
        return word_machine

    def get_machines_from_deps(self, dep_strings):
        # deprecated, use get_machines_from_deps_and_corefs
        deps = map(DepTo4lang.parse_dependency, dep_strings)
        return self.get_machines_from_parsed_deps(deps)

    def get_machines_from_parsed_deps(self, deps):
        # deprecated, use get_machines_from_deps_and_corefs
        return self.get_machines_from_deps_and_corefs([deps], [])

    def get_machines_from_deps_and_corefs(self, dep_lists, corefs):
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no - 1] = word
        # logging.info('coref index: {0}'.format(coref_index))
        lexicon = Lexicon()
        word2machine = {}
        for i, deps in enumerate(dep_lists):
            try:
                for dep, (word1, id1), (word2, id2) in deps:
                    # logging.info('w1: {0}, w2: {1}'.format(word1, word2))
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)
                    """
                    if c_word1 != word1:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word1, c_word1))
                    if c_word2 != word2:
                        logging.warning(
                            "unifying '{0}' with canonical '{1}'".format(
                                word2, c_word2))
                    """
                    # logging.info(
                    #     'cw1: {0}, cw2: {1}'.format(c_word1, c_word2))
                    lemma1 = self.lemmatizer.lemmatize(c_word1)
                    lemma2 = self.lemmatizer.lemmatize(c_word2)
                    lemma1 = c_word1 if not lemma1 else lemma1
                    lemma2 = c_word2 if not lemma2 else lemma2
                    # TODO
                    lemma1 = lemma1.replace('/', '_PER_')
                    lemma2 = lemma2.replace('/', '_PER_')
                    # logging.info(
                    #     'lemma1: {0}, lemma2: {1}'.format(lemma1, lemma2))
                    machine1, machine2 = self._add_dependency(
                        dep, (lemma1, id1), (lemma2, id2), lexicon)
                    word2machine[lemma1] = machine1
                    word2machine[lemma2] = machine2
            except:
                logging.error("failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")
        return word2machine

    def _add_dependency(self, dep, (word1, id1), (word2, id2), lexicon):
        """Given a triplet from Stanford Dep.: D(w1,w2), we create and
        activate machines for w1 and w2, then run all operators associated
        with D on the sequence of the new machines (m1, m2)"""
        # logging.info(
        #     'adding dependency {0}({1}, {2})'.format(dep, word1, word2))
        machine1, machine2 = map(lexicon.get_machine, (word1, word2))
        self.apply_dep(dep, machine1, machine2)
        return machine1, machine2
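# For reference, DepTo4lang.dep_regex matches Stanford-style typed-dependency
# strings, so parse_dependency() yields the relation name plus the two indexed
# tokens. A small illustration (the input string is made up):
dep, (word1, id1), (word2, id2) = DepTo4lang.parse_dependency(
    "nsubj(runs-2, dog-1)")
assert dep == "nsubj"
assert (word1, id1) == ("runs", "2")  # governor token and its index
assert (word2, id2) == ("dog", "1")   # dependent token and its index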
from lemmatizer import Lemmatizer

lemma = Lemmatizer()
words = [
    'bersetubuh', 'berdansa', 'penamaan', 'berusaha', 'berdansa',
    'bolak-balik', 'gemetar', 'petanggungjawaban', 'kepastian',
    'berpendidikan', 'berhubungan', 'berwawasan', 'pengetahuan',
    'pengembala', 'penarikan', 'terbengkalai', 'rumahku',
    'penanggulangan', 'perpecahan', 'pemalas', 'tertikunganlah',
    'perdamaian', 'terbirit-birit', 'cebokan', 'mengotomatisasikan',
    'menyelesaikan', 'sekawasan', 'pengertian', 'ketidakpastian']
# Same output as the original single call: all lemmas on one line,
# separated by spaces.
print(*(lemma.lemmatize(word) for word in words))
def indexCorpus():
    indexer = Indexer(database)
    # index normal articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles")
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()
    # index lemmatized articles
    indexer.corpus_cursor = database.fetch_data(
        "SELECT * FROM articles_lemma")
    indexer.output_catalog = "./indexes_lemmatized/"
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()


if __name__ == "__main__":
    if len(sys.argv) > 2:
        _usage()
    lemmatizer = Lemmatizer()
    lemmatizer.makeDictionaryMap()
    if len(sys.argv) == 2:
        if sys.argv[1] == 'index':
            database = Database()
            lemmatizeCorpus(lemmatizer)
            indexCorpus()
    app.run()
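# The entry point above relies on _usage() and lemmatizeCorpus(), which are
# defined elsewhere in the project. A hypothetical sketch of _usage() only
# (the script name and message are assumptions, not taken from the source):
def _usage():
    sys.stderr.write("usage: python app.py [index]\n")  # assumed invocation
    sys.exit(1)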