Esempio n. 1
0
   def __init__(self, paper):
      """Build and cache the abstract words and bigrams of a paper's references.

      The results are stored on the class, in BaseAbstractMethod.abstractWords
      and BaseAbstractMethod.abstractBigrams, keyed by paper.title. When an
      entry for that title already exists, nothing is recomputed.

      paper -- object exposing `title` and `references`; `references` is a
               mapping from a reference key to an object with an `abstract`
               attribute.
      """
      super(BaseAbstractMethod, self).__init__()

      # Already computed for this paper: reuse the class-level cache.
      if paper.title in BaseAbstractMethod.abstractWords:
         return

      paperAbstractWords = {}
      paperAbstractBigrams = {}

      for (key, reference) in paper.references.items():
         paperAbstractWords[key] = util.getCapitalWords(reference.abstract)
         # Alternative word extraction, kept for reference:
         #paperAbstractWords[key] = util.removeStopwords(set(util.wordSplit(reference.abstract)))
         paperAbstractBigrams[key] = util.getNonStopNgrams(reference.abstract, 2)

      # NOTE(review): presumably uniqueSets keeps only the entries that are
      # unique to each reference -- confirm against util.
      paperAbstractWords = util.uniqueSets(paperAbstractWords)
      paperAbstractBigrams = util.uniqueSets(paperAbstractBigrams)

      # Promote a stemmed word to the unigrams when its number of occurrences
      # across a reference's bigrams is >= 25% of that reference's bigram count.
      for (referenceKey, bigrams) in paperAbstractBigrams.items():
         counts = {}
         for bigram in bigrams:
            # Bigrams are '-'-joined word pairs.
            for word in bigram.split('-'):
               stem = util.STEMMER.stem(word)
               counts[stem] = counts.get(stem, 0) + 1

         # `counts` is empty whenever `bigrams` is, so the division below is
         # never reached with a zero denominator.
         for (stem, count) in counts.items():
            if float(count) / len(bigrams) >= 0.25:
               paperAbstractWords[referenceKey].add(stem)

      if DEBUG:
         # Single-argument parenthesised print behaves the same on Python 2
         # and is also valid Python 3 syntax.
         print("ABSTRACT:")
         for (ref, nouns) in paperAbstractWords.items():
            print("{0}\n\tWords -- {1}\n\tBigrams -- {2}".format(ref, nouns, paperAbstractBigrams[ref]))

      BaseAbstractMethod.abstractWords[paper.title] = paperAbstractWords
      BaseAbstractMethod.abstractBigrams[paper.title] = paperAbstractBigrams
Esempio n. 2
0
   def __init__(self, paper):
      """Build and cache the title/author nouns of a paper's references.

      The result is stored on the class, in BaseTitleAuthorMethod.properNouns,
      keyed by paper.title. When an entry for that title already exists,
      nothing is recomputed.

      paper -- object exposing `title` and `references`; `references` is a
               mapping from a reference key to an object with `authors`
               (an iterable of strings) and `title` attributes.
      """
      super(BaseTitleAuthorMethod, self).__init__()

      # Already computed for this paper: reuse the class-level cache.
      if paper.title in BaseTitleAuthorMethod.properNouns:
         return

      # Pre-build a list of all proper nouns in the references' title and authors.
      paperProperNouns = {}

      # TODO(eriq): remove stopwords and try to remove other capitalization false positives.
      for (key, reference) in paper.references.items():
         # Make sure to avoid initials (and double initials): only runs of
         # three or more word characters are kept.
         authorNouns = re.findall(r'\w{3,}', ' '.join(reference.authors))
         paperProperNouns[key] = set(authorNoun.upper() for authorNoun in authorNouns)
         paperProperNouns[key] |= util.removeTitleStopwords(util.getCapitalWords(reference.title))

      # NOTE(review): the meaning of the second argument (1) is defined in
      # util.uniqueSets -- confirm there.
      paperProperNouns = util.uniqueSets(paperProperNouns, 1)

      if DEBUG:
         # Single-argument parenthesised print behaves the same on Python 2
         # and is also valid Python 3 syntax.
         print("Title/Author:")
         for (ref, nouns) in paperProperNouns.items():
            print("{0} -- {1}".format(ref, nouns))

      BaseTitleAuthorMethod.properNouns[paper.title] = paperProperNouns