def __init__(self, parseStructure, root=True):
    """Populate this paper from an already-parsed structure.

    parseStructure -- for a root paper, a dict holding the paper's own data
        under "root" and its cited papers under "references"; for a
        non-root paper, the dict describing that paper directly.
    root -- True for the citing (top-level) paper, False for a paper that
        is being built from one entry of a root paper's "references".

    Root papers additionally get their citation-related text, their
    pre-processed citations, and a `references` dict of non-root papers.
    """
    self.root = root

    # A root paper's own data is nested one level down under "root";
    # a referenced paper's data sits at the top of its structure.
    ownData = parseStructure["root"] if root else parseStructure

    meta = ownData["meta"]
    self.title = meta["title"]
    self.authors = meta["authors"]
    self.terms = meta["terms"]
    self.categories = meta["categories"]
    self.abstract = meta["abstract"]

    self.fullText = ownData["fullText"]
    self.pdfPath = ownData["pdfPath"]

    # Everything below only applies to the root paper.
    if not root:
        return

    self.noCitationsText = ownData["noCitationsText"]
    self.noNumbersText = ownData["noNumbersText"]
    self.citationKey = ownData["citationKey"]
    self.citations = ownData["citations"]

    # Pre-compute, per citation, the features taken from its sentence
    # and paragraph context (both with the citation markers removed).
    for citation in self.citations:
        sentenceText = citation.sentenceContext.noCitations
        paragraphText = citation.paragraphContext.noCitations

        # Proper nouns: capitalized words minus stopwords.
        citation.sentenceProperNouns = util.removeStopwords(util.getCapitalWords(sentenceText))
        citation.paragraphProperNouns = util.removeStopwords(util.getCapitalWords(paragraphText))

        # Bigrams.
        citation.sentenceBigrams = util.getNonStopNgrams(sentenceText, 2)
        citation.paragraphBigrams = util.getNonStopNgrams(paragraphText, 2)

        # Fold the important unigrams of the bigrams into the proper nouns.
        citation.sentenceProperNouns.update(util.importantUnigrams(citation.sentenceBigrams))
        citation.paragraphProperNouns.update(util.importantUnigrams(citation.paragraphBigrams))

    # A dict (rather than a list) so that missing references are tolerated
    # and keys can start at 1.
    # NOTE(review): "references" is read from the same top-level structure
    # that holds "root", so this is treated as root-only -- confirm against
    # the original indentation.
    self.references = {}
    for (referenceKey, referenceStructure) in parseStructure["references"].items():
        self.references[referenceKey] = Paper(referenceStructure, False)
def __init__(self, paper):
    """Build the per-reference abstract words and bigrams for `paper`.

    The results are cached on the class, keyed by `paper.title`, in
    `BaseAbstractMethod.abstractWords` and
    `BaseAbstractMethod.abstractBigrams`; constructing another instance
    for a paper with the same title returns without recomputing.

    paper -- object exposing `title` and a `references` dict whose values
        expose an `abstract`.
    """
    super(BaseAbstractMethod, self).__init__()

    # Already computed for this paper; reuse the class-level cache.
    if paper.title in BaseAbstractMethod.abstractWords:
        return

    paperAbstractWords = {}
    paperAbstractBigrams = {}
    for (key, reference) in paper.references.items():
        paperAbstractWords[key] = util.getCapitalWords(reference.abstract)
        paperAbstractBigrams[key] = util.getNonStopNgrams(reference.abstract, 2)

    # NOTE(review): presumably keeps only the entries that are unique to
    # each reference -- confirm against util.uniqueSets.
    paperAbstractWords = util.uniqueSets(paperAbstractWords)
    paperAbstractBigrams = util.uniqueSets(paperAbstractBigrams)

    # If a (stemmed) word appears in >= 25% of a reference's bigrams, then
    # promote it into that reference's unigrams.
    for (referenceKey, bigrams) in paperAbstractBigrams.items():
        counts = {}
        for bigram in bigrams:
            # The two halves of a bigram are joined with '-'.
            for word in bigram.split('-'):
                word = util.STEMMER.stem(word)
                counts[word] = counts.get(word, 0) + 1

        # `counts` is empty whenever `bigrams` is, so the division below
        # never sees a zero denominator.
        for (word, count) in counts.items():
            if float(count) / len(bigrams) >= 0.25:
                paperAbstractWords[referenceKey].add(word)

    if DEBUG:
        # Single-argument parenthesised print emits the same text under
        # both Python 2 and Python 3.
        print("ABSTRACT:")
        for (ref, nouns) in paperAbstractWords.items():
            print("{0}\n\tWords -- {1}\n\tBigrams -- {2}".format(ref, nouns, paperAbstractBigrams[ref]))

    BaseAbstractMethod.abstractWords[paper.title] = paperAbstractWords
    BaseAbstractMethod.abstractBigrams[paper.title] = paperAbstractBigrams
def __init__(self, paper):
    """Build the per-reference proper nouns from titles and authors.

    The result is cached on the class, keyed by `paper.title`, in
    `BaseTitleAuthorMethod.properNouns`; constructing another instance for
    a paper with the same title returns without recomputing.

    paper -- object exposing `title` and a `references` dict whose values
        expose `authors` (an iterable of strings) and `title`.
    """
    super(BaseTitleAuthorMethod, self).__init__()

    # Already computed for this paper; reuse the class-level cache.
    if paper.title in BaseTitleAuthorMethod.properNouns:
        return

    # Pre-build a list of all proper nouns in the references' title and authors.
    paperProperNouns = {}

    # TODO(eriq): remove stopwords and try to remove other capitalization false positives.
    for (key, reference) in paper.references.items():
        # Make sure to avoid initials (and double initials): only runs of
        # three or more word characters are kept.
        authorNouns = re.findall(r'\w{3,}', ' '.join(reference.authors))
        paperProperNouns[key] = set(authorNoun.upper() for authorNoun in authorNouns)
        paperProperNouns[key] |= util.removeTitleStopwords(util.getCapitalWords(reference.title))

    # NOTE(review): the meaning of the second argument (1) is defined by
    # util.uniqueSets -- confirm there.
    paperProperNouns = util.uniqueSets(paperProperNouns, 1)

    if DEBUG:
        # Single-argument parenthesised print emits the same text under
        # both Python 2 and Python 3.
        print("Title/Author:")
        for (ref, nouns) in paperProperNouns.items():
            print("{0} -- {1}".format(ref, nouns))

    BaseTitleAuthorMethod.properNouns[paper.title] = paperProperNouns