Beispiel #1
0
    def __init__(self, parseStructure, root=True):
        """Populate this paper from a parsed structure.

        parseStructure -- mapping holding the parsed paper. For a root paper
                          the paper's own data is read from
                          parseStructure["root"] and its references from
                          parseStructure["references"]; for a non-root paper
                          the data is read directly from parseStructure.
        root           -- whether this is the top-level paper. Only a root
                          paper receives the citation-related attributes and
                          a `references` dict.
        """
        self.root = root

        # A root paper keeps its own data under the "root" key; a reference
        # paper keeps the same keys at the top level of its structure.
        data = parseStructure["root"] if root else parseStructure

        meta = data["meta"]
        self.title = meta["title"]
        self.authors = meta["authors"]
        self.terms = meta["terms"]
        self.categories = meta["categories"]
        self.abstract = meta["abstract"]

        self.fullText = data["fullText"]
        self.pdfPath = data["pdfPath"]

        # Everything below only exists for the root paper.
        if not root:
            return

        self.noCitationsText = data["noCitationsText"]
        self.noNumbersText = data["noNumbersText"]
        self.citationKey = data["citationKey"]
        self.citations = data["citations"]

        # Pre-process each citation's sentence and paragraph context.
        for citation in self.citations:
            # Proper nouns: capitalised words with stopwords removed.
            sentenceText = citation.sentenceContext.noCitations
            sentenceCapitals = util.getCapitalWords(sentenceText)
            citation.sentenceProperNouns = util.removeStopwords(sentenceCapitals)

            paragraphText = citation.paragraphContext.noCitations
            paragraphCapitals = util.getCapitalWords(paragraphText)
            citation.paragraphProperNouns = util.removeStopwords(paragraphCapitals)

            # Bigrams of the same two contexts.
            citation.sentenceBigrams = util.getNonStopNgrams(sentenceText, 2)
            citation.paragraphBigrams = util.getNonStopNgrams(paragraphText, 2)

            # Fold the important unigrams of the bigrams into the proper nouns.
            sentenceUnigrams = util.importantUnigrams(citation.sentenceBigrams)
            citation.sentenceProperNouns.update(sentenceUnigrams)
            paragraphUnigrams = util.importantUnigrams(citation.paragraphBigrams)
            citation.paragraphProperNouns.update(paragraphUnigrams)

        # A dict rather than a list, to accommodate missing references and
        # indexing that starts at 1. Each reference is built as a non-root
        # paper.
        self.references = {}
        for refKey, refStructure in parseStructure["references"].items():
            self.references[refKey] = Paper(refStructure, False)
Beispiel #2
0
   def __init__(self, paper):
      """Build and cache the abstract words and bigrams of a paper's references.

      Results are stored on the class, keyed by paper.title, in
      BaseAbstractMethod.abstractWords and BaseAbstractMethod.abstractBigrams.
      If an entry for paper.title already exists nothing is recomputed.

      paper -- object exposing `title` and a `references` mapping whose
               values expose `abstract`.
      """
      super(BaseAbstractMethod, self).__init__()

      # Already computed for this paper. `in` is used instead of
      # dict.has_key(), which does not exist in Python 3.
      if paper.title in BaseAbstractMethod.abstractWords:
         return

      paperAbstractWords = {}
      paperAbstractBigrams = {}

      for (key, reference) in paper.references.items():
         paperAbstractWords[key] = util.getCapitalWords(reference.abstract)
         paperAbstractBigrams[key] = util.getNonStopNgrams(reference.abstract, 2)

      paperAbstractWords = util.uniqueSets(paperAbstractWords)
      paperAbstractBigrams = util.uniqueSets(paperAbstractBigrams)

      # Count every stemmed component of the '-'-joined bigrams. A stem whose
      # count is at least 25% of the number of bigrams for that reference is
      # added to the reference's words.
      for (referenceKey, bigrams) in paperAbstractBigrams.items():
         counts = {}
         for bigram in bigrams:
            for word in bigram.split('-'):
               word = util.STEMMER.stem(word)
               counts[word] = counts.get(word, 0) + 1
         # `counts` is empty whenever `bigrams` is, so the division below is
         # never reached with a zero denominator.
         for (word, count) in counts.items():
            if float(count) / len(bigrams) >= 0.25:
               paperAbstractWords[referenceKey].add(word)

      if DEBUG:
         # Single-argument print(...) produces the same output under both
         # the Python 2 print statement and the Python 3 print function.
         print("ABSTRACT:")
         for (ref, nouns) in paperAbstractWords.items():
            print("{0}\n\tWords -- {1}\n\tBigrams -- {2}".format(ref, nouns, paperAbstractBigrams[ref]))

      BaseAbstractMethod.abstractWords[paper.title] = paperAbstractWords
      BaseAbstractMethod.abstractBigrams[paper.title] = paperAbstractBigrams
Beispiel #3
0
   def __init__(self, paper):
      """Build and cache proper nouns from the references' titles and authors.

      Results are stored on the class, keyed by paper.title, in
      BaseTitleAuthorMethod.properNouns. If an entry for paper.title already
      exists nothing is recomputed.

      paper -- object exposing `title` and a `references` mapping whose
               values expose `authors` (an iterable of strings) and `title`.
      """
      super(BaseTitleAuthorMethod, self).__init__()

      # Already computed for this paper. `in` is used instead of
      # dict.has_key(), which does not exist in Python 3.
      if paper.title in BaseTitleAuthorMethod.properNouns:
         return

      # Pre-build a list of all proper nouns in the references' title and authors.
      paperProperNouns = {}

      # TODO(eriq): remove stopwords and try to remove other capitalization false positives.
      for (key, reference) in paper.references.items():
         # Only runs of three or more word characters are kept, which avoids
         # initials (and double initials). The pattern is a raw string so
         # that `\w` is not treated as an (invalid) string escape.
         authorNouns = re.findall(r'\w{3,}', ' '.join(reference.authors))
         paperProperNouns[key] = set(authorNoun.upper() for authorNoun in authorNouns)
         paperProperNouns[key] |= util.removeTitleStopwords(util.getCapitalWords(reference.title))

      paperProperNouns = util.uniqueSets(paperProperNouns, 1)

      if DEBUG:
         # Single-argument print(...) produces the same output under both
         # the Python 2 print statement and the Python 3 print function.
         print("Title/Author:")
         for (ref, nouns) in paperProperNouns.items():
            print("{0} -- {1}".format(ref, nouns))

      BaseTitleAuthorMethod.properNouns[paper.title] = paperProperNouns