def noun_phrase_func(doc):
    # Combine noun phrases, title tokens, keywords and named entities
    # into one deduplicated, stemmed feature list for the document.
    tn = TextNormalizer()
    noun_tokens = tn.fmap(doc["np"])
    title_tokens = tn.tnormalize(doc["title"])
    keyword_tokens = tn.fmap(doc["keywords"])
    ners = __ners(doc["ner"])
    return __stem__(unique(noun_tokens + title_tokens + keyword_tokens + ners))

def get_lines(line_per_category):
    # Stream up to line_per_category normalized lines per Wikipedia
    # category as (category, line) pairs.
    tn = TextNormalizer()
    for category in CATEGORIES.keys():
        with open(wiki_path(category), "r+") as f:
            print("Loaded category {}".format(category))
            i = 0
            for line in f:
                if line:
                    normalized = tn.normalize(line.lower().strip("\n")).strip()
                    if normalized:
                        yield (category, normalized)
                        i += 1
                        if i % line_per_category == 0:
                            break

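# Hedged usage sketch, not part of the original module: get_lines is a
# generator of (category, normalized_line) pairs, so it can be consumed
# lazily. The per-category cap and the preview size below are arbitrary
# example values.
import itertools

def preview_wiki_lines(line_per_category=1000, n=5):
    # Peek at the first n normalized lines across all categories.
    for category, normalized in itertools.islice(get_lines(line_per_category), n):
        print(category, normalized[:80])
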
def read(self):
    text_normalizer = TextNormalizer()
    documents = {}
    for tex_path in self.yield_files():
        vec = self.with_open(tex_path)
        vec_norm = [
            token for token in text_normalizer.fmap(vec) if token.strip()
        ]
        vec_stop = self.remove_latex(vec_norm)
        if vec_stop:
            tex_file = tex_path.split("/")[-1]
            documents[tex_file] = (tex_file, vec_stop)
    return documents

def bbc_parser(doc, category):
    try:
        # Give the parse at most 15 seconds (requires a SIGALRM handler,
        # see the sketch below).
        signal.alarm(15)
        s = doc.split("\n")
        title = s[0]
        _id = normalize_title(title)
        tn = TextNormalizer()
        pos, nouns, ners = semantics(doc)
        nouns = tn.fmap(nouns)
        ners = tn.fmap(ners)
        paragraphs = []
        sentences = []
        for line in s[1:]:
            if not line:
                continue
            sentences += tn.fmap(sentence_tokenize(line))
            paragraphs.append(tn.normalize(line))
        signal.alarm(0)
        return {
            "id": _id,
            "title": title,
            "sents": sentences,
            "paras": paragraphs,
            "pos": pos,
            "nouns": nouns,
            "ners": ners
        }
    except Exception as e:
        print("Could not process article: {}".format(e))
        signal.alarm(0)
        return None

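# Note on the timeout above: signal.alarm(15) only interrupts the parse if
# a SIGALRM handler that raises has been installed beforehand (Unix only).
# A minimal sketch of such a setup, assumed rather than taken from the
# original module:
import signal

def _timeout_handler(signum, frame):
    # Raising here makes the alarm land in bbc_parser's except block.
    raise TimeoutError("bbc_parser timed out")

signal.signal(signal.SIGALRM, _timeout_handler)
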
def get_sents_set(from_date, to_date=None):
    if not to_date:
        to_date = from_date
    tn = TextNormalizer()
    docs = defaultdict(list)
    for doc in get_days(from_date, to_date):
        doc["sents"] = sentence_tokenize(doc["text"], tn.normalize)
        p = [doc["id"], doc["title"]] + doc["sents"]
        docs[doc["index"]].append(p)
    return docs

def stream_titles(category_to_titles):
    # Fetch each title's latest revision from the wiki API, normalize it
    # and append it to the category's file.
    tn = TextNormalizer()
    fin = defaultdict(list)
    for category, titles in category_to_titles:
        for title in titles:
            try:
                payload = {
                    "format": "json",
                    "action": "query",
                    "titles": title.strip("\n[#<>[]|{}]"),
                    "prop": "revisions",
                    "rvprop": "content"
                }
                for r in query(payload):
                    for _, h1 in r["pages"].items():
                        print(category, h1["title"])
                        content = tn.normalize(h1["revisions"][0]["*"])
                        append_file(category, [content])
            except Exception as e:
                print(e.__doc__)

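# Hedged sketch, not taken from the original module: the payload keys above
# (action=query, prop=revisions, rvprop=content) are standard MediaWiki API
# parameters, so the query() helper presumably pages through an endpoint
# like the one below. The endpoint URL, the use of `requests`, and the
# continuation handling are assumptions.
import requests

WIKI_API = "https://en.wikipedia.org/w/api.php"

def _query_sketch(payload):
    params = dict(payload)
    while True:
        result = requests.get(WIKI_API, params=params).json()
        if "query" in result:
            # Yields dicts containing the "pages" mapping used above.
            yield result["query"]
        if "continue" not in result:
            break
        params.update(result["continue"])
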
def get_semantic_set(from_date, to_date=None):
    if not to_date:
        to_date = from_date
    tn = TextNormalizer()
    docs = defaultdict(list)
    for doc in get_days(from_date, to_date):
        keywords = list(doc["keywords"])
        pos = flatten(doc["pos"])
        ner = __ners(doc["ner"])
        nps = doc["np"]
        # Pack each document into one length-prefixed row:
        # [id, title, n_kw, *keywords, n_pos, *pos, n_ner, *ners, n_np, *nps]
        p = [doc["id"], doc["title"]]
        p += [len(keywords)] + keywords
        p += [len(pos)] + pos
        p += [len(ner)] + ner
        p += [len(nps)] + nps
        docs[doc["index"]].append(p)
    return docs

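# Hedged helper sketch (not in the original code): since every row built by
# get_semantic_set is length-prefixed, it can be unpacked again like this.
def unpack_semantic_row(p):
    doc_id, title = p[0], p[1]
    rest, segments = p[2:], []
    for _ in range(4):  # keywords, POS tokens, NERs, noun phrases
        n, rest = rest[0], rest[1:]
        segments.append(rest[:n])
        rest = rest[n:]
    keywords, pos, ners, nps = segments
    return doc_id, title, keywords, pos, ners, nps
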
def load_wn_domains():
    p = "/Users/sacry/dev/uni/bachelor/data/wordnet-domains-sentiwords/wn-domains/wn-domains-3.2-20070223"
    domain2synsets = defaultdict(list)
    synset2domains = defaultdict(list)
    for i in open(p, 'r'):
        ssid, doms = i.strip().split('\t')
        doms = doms.split()
        synset2domains[ssid] = doms
        for d in doms:
            domain2synsets[d].append(ssid)
    return domain2synsets, synset2domains


def synset2id(ss):
    # WordNet-Domains keys synsets as "<zero-padded offset>-<pos>".
    return str(ss.offset()).zfill(8) + "-" + ss.pos()


normalizer = TextNormalizer()
PORTER = PorterStemmer()


def stem(w):
    return PORTER.stem(w).lower()


def hyper_closure(syn):
    return list(syn.closure(lambda x: x.hypernyms()))


def hypo_closure(syn):
    return list(syn.closure(lambda x: x.hyponyms()))


def n_gram_concepts(closure1, closure2):
    r = []
    for clos1 in closure1:
        for clos2 in closure2:
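
# Hedged usage sketch, not part of the original file: synset2id produces
# the "offset-pos" keys used in the WordNet-Domains file, so the mapping
# returned by load_wn_domains can be joined with NLTK synsets. Note that
# the domain file targets an older WordNet release, so some offsets may
# not line up with the NLTK corpus.
from nltk.corpus import wordnet as wn

def domains_for_word(word, synset2domains):
    doms = set()
    for ss in wn.synsets(word):
        doms.update(synset2domains.get(synset2id(ss), []))
    return doms

# e.g.
#   _, synset2domains = load_wn_domains()
#   print(domains_for_word("guitar", synset2domains))
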
def without_noun_func(doc):
    tn = TextNormalizer()
    return __stem__(
        tn.fmap(
            [word for word, pos in doc["pos"] if pos != 'NNP' and pos != 'NNPS']
            + __ners(doc["ner"])
        )
    )

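# Hedged usage note, not from the original code: noun_phrase_func and
# without_noun_func take the same parsed-article dict and return stemmed
# token lists, one built from noun phrases, title, keywords and entities,
# the other from all non-proper-noun POS tokens plus entities. The dict
# shape is an assumption inferred from the keys they read:
#
#   doc = ...  # parsed article with "np", "title", "keywords", "pos", "ner"
#   np_features = noun_phrase_func(doc)
#   non_noun_features = without_noun_func(doc)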