Ejemplo n.º 1
0
    def __init__(
        self,
        paragraph,
        noCitesParagraph,
        noNumbersParagraph,
        markedParagraph,
        sentence,
        noCitesSentence,
        noNumbersSentence,
        markedSentence,
        citesPerParagraph,
        citesPerSentence,
    ):
        """Store the sentence/paragraph contexts and derive the pre-context.

        The five sentence-level and five paragraph-level arguments are passed
        straight through to ``Context`` and kept as ``sentenceContext`` and
        ``paragraphContext``.

        ``preContext`` is extracted from ``markedSentence``:

        * if the citation marker sits inside a parenthesised span, the
          contents of that span with the marker replaced by a space;
        * otherwise, the run of text directly before the marker that contains
          none of the characters ``[ ] . , ; : ! " ? -``;
        * otherwise the empty string.

        ``preContextUnigrams`` / ``preContextBigrams`` hold the result of
        ``util.getNonStopNgrams`` on the pre-context for n = 1 and n = 2.
        """
        self.sentenceContext = Context(sentence, noCitesSentence, noNumbersSentence, markedSentence, citesPerSentence)
        self.paragraphContext = Context(
            paragraph, noCitesParagraph, noNumbersParagraph, markedParagraph, citesPerParagraph
        )

        # The marker is inserted into regex patterns below, so escape it once.
        marker = re.escape(parser.MARKED_CITATION_MARKER)

        # Figure out the pre context
        self.preContext = ""

        # Check for a parenthesised surround first.
        # Raw strings keep the patterns identical while avoiding the
        # invalid-escape-sequence warning for "\(", "\[" and friends.
        # NOTE(review): the class after the marker excludes "(" rather than
        # ")", so the match may run past the first ")" -- confirm intended.
        match = re.search(r"\(([^\)]*{}[^\(]*)\)".format(marker), markedSentence)
        if match:
            self.preContext = match.group(1).replace(parser.MARKED_CITATION_MARKER, " ").strip()
        else:
            # Fall back to the punctuation-free text directly before the marker.
            match = re.search(r'([^\[\]\.,;:!"\?\-]+){}'.format(marker), markedSentence)

            if match:
                self.preContext = match.group(1).strip()

        self.preContextUnigrams = util.getNonStopNgrams(self.preContext, 1)
        self.preContextBigrams = util.getNonStopNgrams(self.preContext, 2)
Ejemplo n.º 2
0
    def __init__(self, parseStructure, root=True):
        """Populate this paper from a parsed structure.

        With ``root`` true, the paper's data is read from
        ``parseStructure["root"]``; citation data is loaded and pre-processed,
        and every entry of ``parseStructure["references"]`` is wrapped in a
        non-root ``Paper``.  With ``root`` false, the data is read from
        ``parseStructure`` itself and only the metadata, full text and PDF
        path are set.
        """
        self.root = root

        # Root papers keep their data under the "root" key; others are flat.
        node = parseStructure["root"] if root else parseStructure

        meta = node["meta"]
        self.title = meta["title"]
        self.authors = meta["authors"]
        self.terms = meta["terms"]
        self.categories = meta["categories"]
        self.abstract = meta["abstract"]

        self.fullText = node["fullText"]
        self.pdfPath = node["pdfPath"]

        # Everything below applies to the root paper only.
        if not root:
            return

        self.noCitationsText = node["noCitationsText"]
        self.noNumbersText = node["noNumbersText"]
        self.citationKey = node["citationKey"]
        self.citations = node["citations"]

        # Pre-compute per-citation features.
        for cite in self.citations:
            # Capitalised words with stop words removed.
            sentenceCapitals = util.getCapitalWords(cite.sentenceContext.noCitations)
            cite.sentenceProperNouns = util.removeStopwords(sentenceCapitals)
            paragraphCapitals = util.getCapitalWords(cite.paragraphContext.noCitations)
            cite.paragraphProperNouns = util.removeStopwords(paragraphCapitals)

            # Bigrams of the citation-free text.
            cite.sentenceBigrams = util.getNonStopNgrams(cite.sentenceContext.noCitations, 2)
            cite.paragraphBigrams = util.getNonStopNgrams(cite.paragraphContext.noCitations, 2)

            # Fold the important unigrams from the bigrams into the proper nouns.
            cite.sentenceProperNouns.update(util.importantUnigrams(cite.sentenceBigrams))
            cite.paragraphProperNouns.update(util.importantUnigrams(cite.paragraphBigrams))

        # Kept as a dict (not a list) so that missing references are allowed
        # and keys can be 1-based.
        self.references = {
            refKey: Paper(refStructure, False)
            for refKey, refStructure in parseStructure["references"].items()
        }
Ejemplo n.º 3
0
   def __init__(self, paper):
      """Fill the class-level abstract word/bigram caches for ``paper``.

      Results are stored in ``BaseAbstractMethod.abstractWords`` and
      ``BaseAbstractMethod.abstractBigrams`` keyed by ``paper.title``; if the
      title is already cached nothing is recomputed.

      For each reference of the paper, the capitalised words and the
      non-stop bigrams of its abstract are collected and passed through
      ``util.uniqueSets``.  Any stemmed word that occurs in at least 25% of
      a reference's bigrams is then added to that reference's word set.
      """
      super(BaseAbstractMethod, self).__init__()

      # Already computed for this paper title -- reuse the cached entry.
      # (``in`` replaces dict.has_key, which no longer exists in Python 3.)
      if paper.title in BaseAbstractMethod.abstractWords:
         return

      paperAbstractWords = {}
      paperAbstractBigrams = {}

      for (key, reference) in paper.references.items():
         paperAbstractWords[key] = util.getCapitalWords(reference.abstract)
         # Alternative kept from the original author (all non-stop words):
         #paperAbstractWords[key] = util.removeStopwords(set(util.wordSplit(reference.abstract)))
         paperAbstractBigrams[key] = util.getNonStopNgrams(reference.abstract, 2)

      paperAbstractWords = util.uniqueSets(paperAbstractWords)
      paperAbstractBigrams = util.uniqueSets(paperAbstractBigrams)

      # If a word appears in >= 25% of bigrams, then put it in the unigrams.
      for (referenceKey, bigrams) in paperAbstractBigrams.items():
         counts = {}
         for bigram in bigrams:
            # Bigrams are '-'-joined word pairs; count each stemmed component.
            for word in bigram.split('-'):
               word = util.STEMMER.stem(word)
               counts[word] = counts.get(word, 0) + 1
         # counts is empty when bigrams is empty, so no division by zero here.
         for (word, count) in counts.items():
            # float() keeps this a true division under Python 2 as well.
            if float(count) / len(bigrams) >= 0.25:
               paperAbstractWords[referenceKey].add(word)

      if DEBUG:
         # Single-argument print(...) behaves the same on Python 2 and 3.
         print("ABSTRACT:")
         for (ref, nouns) in paperAbstractWords.items():
            print("{0}\n\tWords -- {1}\n\tBigrams -- {2}".format(ref, nouns, paperAbstractBigrams[ref]))

      BaseAbstractMethod.abstractWords[paper.title] = paperAbstractWords
      BaseAbstractMethod.abstractBigrams[paper.title] = paperAbstractBigrams