Example #1
def noun_phrase_func(doc):
    # Collect normalized noun-phrase, title, and keyword tokens plus the
    # named entities, then deduplicate and stem the combined list.
    tn = TextNormalizer()
    noun_tokens = tn.fmap(doc["np"])
    title_tokens = tn.tnormalize(doc["title"])
    keyword_tokens = tn.fmap(doc["keywords"])
    ners = __ners(doc["ner"])
    return __stem__(unique(noun_tokens + title_tokens + keyword_tokens + ners))
Example #4
def get_lines(line_per_category):
    # Read line_per_category lines from each category's wiki file, yielding
    # (category, normalized_line) for every line that normalizes to
    # non-empty text.
    tn = TextNormalizer()
    for category in CATEGORIES.keys():
        with open(wiki_path(category), "r+") as f:
            print("Loaded category {}".format(category))
            i = 0
            for line in f:
                if line:
                    normalized = tn.normalize(line.lower().strip("\n")).strip()
                    if normalized:
                        yield (category, normalized)
                    i += 1
                if i % line_per_category == 0:
                    break
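get_lines yields (category, normalized_line) pairs, so callers can regroup them as needed. A minimal consumption sketch, assuming CATEGORIES and wiki_path are defined as in the source project (the batch size of 1000 is an arbitrary choice):

from collections import defaultdict

lines_by_category = defaultdict(list)
for category, line in get_lines(1000):
    lines_by_category[category].append(line)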
Example #6
    def read(self):
        # Normalize each .tex file into tokens, drop empty tokens, filter
        # LaTeX-specific tokens with remove_latex, and key the surviving
        # documents by file name.
        text_normalizer = TextNormalizer()

        documents = {}
        for tex_path in self.yield_files():
            vec = self.with_open(tex_path)
            vec_norm = [
                token for token in text_normalizer.fmap(vec) if token.strip()
            ]
            vec_stop = self.remove_latex(vec_norm)
            if vec_stop:
                tex_file = tex_path.split("/")[-1]
                documents[tex_file] = (tex_file, vec_stop)

        return documents
def bbc_parser(doc, category):
    try:
        # Give each article at most 15 seconds; a SIGALRM handler installed
        # elsewhere in the project raises on timeout.
        signal.alarm(15)
        s = doc.split("\n")
        title = s[0]
        _id = normalize_title(title)

        tn = TextNormalizer()
        pos, nouns, ners = semantics(doc)
        nouns = tn.fmap(nouns)
        ners = tn.fmap(ners)

        # Normalize sentences and paragraphs from the body lines,
        # skipping blanks.
        paragraphs = []
        sentences = []
        for line in s[1:]:
            if not line:
                continue
            sentences += tn.fmap(sentence_tokenize(line))
            paragraphs.append(tn.normalize(line))

        signal.alarm(0)
        return {
            "id": _id,
            "title": title,
            "sents": sentences,
            "paras": paragraphs,
            "pos": pos,
            "nouns": nouns,
            "ners": ners
        }
    except Exception as e:
        print("Could not process article: {}".format(e))
    signal.alarm(0)
    return None
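signal.alarm(15) only cuts off a slow article if a SIGALRM handler that raises has been installed beforehand; that handler is not part of this snippet. A typical setup (Unix only) might look like the following, shown as an assumption rather than the project's actual code:

import signal

def _raise_timeout(signum, frame):
    # Turn the pending alarm into an exception that bbc_parser's
    # try/except can catch.
    raise TimeoutError("article processing timed out")

signal.signal(signal.SIGALRM, _raise_timeout)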
Example #8
def get_sents_set(from_date, to_date=None):
    if not to_date:
        to_date = from_date

    tn = TextNormalizer()

    docs = defaultdict(list)
    for doc in get_days(from_date, to_date):
        doc["sents"] = sentence_tokenize(doc["text"], tn.normalize)
        p = [doc["id"], doc["title"]] + doc["sents"]
        docs[doc["index"]].append(p)

    return docs
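Each stored row is the document id and title followed by the normalized sentences, grouped by the document's index. A hedged sketch of reading the result (the date format expected by get_days is not shown here, so the arguments are placeholders):

docs = get_sents_set("2016-01-01", "2016-01-07")
for index_name, rows in docs.items():
    for row in rows:
        doc_id, title, sentences = row[0], row[1], row[2:]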
Example #10
def stream_titles(category_to_titles):
    # For each category, fetch the current revision wikitext of every title
    # via the wiki API, normalize it, and append it to the category's file.
    tn = TextNormalizer()
    fin = defaultdict(list)
    for category, titles in category_to_titles:
        for title in titles:
            try:
                payload = {
                    "format": "json",
                    "action": "query",
                    "titles": title.strip("\n[#<>[]|{}]"),
                    "prop": "revisions",
                    "rvprop": "content"
                }

                for r in query(payload):
                    for _, h1 in r["pages"].items():
                        print(category, h1["title"])
                        content = tn.normalize(h1["revisions"][0]["*"])
                        append_file(category, [content])

            except Exception as e:
                print(e.__doc__)
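query() is a helper from the source project and is not shown here. Judging from how the response is consumed (r["pages"] and revisions[0]["*"]), it wraps the MediaWiki action API in its legacy JSON format; a rough stand-in, offered purely as an assumption, could look like:

import requests

def query(payload, endpoint="https://en.wikipedia.org/w/api.php"):
    # Single request, no continuation handling; the real helper may differ.
    resp = requests.get(endpoint, params=payload, timeout=30)
    resp.raise_for_status()
    yield resp.json()["query"]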
Example #11
def get_semantic_set(from_date, to_date=None):
    if not to_date:
        to_date = from_date

    tn = TextNormalizer()

    docs = defaultdict(list)
    for doc in get_days(from_date, to_date):
        keywords = list(doc["keywords"])
        pos = flatten(doc["pos"])
        ner = __ners(doc["ner"])
        nps = doc["np"]

        p = [doc["id"], doc["title"]]
        p += [len(keywords)] + keywords
        p += [len(pos)] + pos
        p += [len(ner)] + ner
        p += [len(nps)] + nps

        docs[doc["index"]].append(p)
    return docs
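Each row built above is length-prefixed: id and title first, then four count-plus-items runs for keywords, POS tags, NER tokens, and noun phrases. A hypothetical helper (not from the source project) for splitting such a row back apart:

def split_semantic_row(row):
    doc_id, title, rest = row[0], row[1], row[2:]
    fields = []
    for _ in range(4):  # keywords, pos, ner, nps
        count, rest = rest[0], rest[1:]
        fields.append(rest[:count])
        rest = rest[count:]
    keywords, pos, ner, nps = fields
    return doc_id, title, keywords, pos, ner, nps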
def load_wn_domains():
    # Parse the WordNet-Domains mapping file: each line holds a synset id,
    # a tab, and a space-separated list of domain labels. Build lookups in
    # both directions.
    p = "/Users/sacry/dev/uni/bachelor/data/wordnet-domains-sentiwords/wn-domains/wn-domains-3.2-20070223"
    domain2synsets = defaultdict(list)
    synset2domains = defaultdict(list)
    for i in open(p, 'r'):
        ssid, doms = i.strip().split('\t')
        doms = doms.split()
        synset2domains[ssid] = doms
        for d in doms:
            domain2synsets[d].append(ssid)
    return domain2synsets, synset2domains

def synset2id(ss):
    # Format an NLTK synset as "<zero-padded offset>-<pos>", the key format
    # used by the domains file.
    return str(ss.offset()).zfill(8) + "-" + ss.pos()
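# Example of the key format this produces, using NLTK's bundled WordNet 3.0
# (the synset is chosen arbitrarily; offsets in the domains file may stem
# from an older WordNet release):
#   from nltk.corpus import wordnet as wn
#   synset2id(wn.synset("dog.n.01"))  # -> "02084071-n"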

normalizer = TextNormalizer()
PORTER = PorterStemmer()

def stem(w):
    # Porter-stem and lower-case a single token.
    return PORTER.stem(w).lower()

def hyper_closure(syn):
    # Transitive closure over hypernyms (all ancestors of the synset).
    return list(syn.closure(lambda x: x.hypernyms()))

def hypo_closure(syn):
    # Transitive closure over hyponyms (all descendants of the synset).
    return list(syn.closure(lambda x: x.hyponyms()))


def n_gram_concepts(closure1, closure2):
    r = []
    for clos1 in closure1:
        for clos2 in closure2:
Example #13
def without_noun_func(doc):
    # Normalize all POS-tagged words except proper nouns (NNP/NNPS) together
    # with the named entities, then stem them.
    tn = TextNormalizer()
    return __stem__(
        tn.fmap([
            word for word, pos in doc["pos"] if pos != 'NNP' and pos != 'NNPS'
        ] + __ners(doc["ner"])))