def __init__(self, paper):
    """Build and cache the abstract unigrams/bigrams for ``paper``'s references.

    Results are stored in the class-level mappings
    ``BaseAbstractMethod.abstractWords`` and
    ``BaseAbstractMethod.abstractBigrams``, keyed by ``paper.title``. If an
    entry for this title already exists in ``abstractWords`` nothing is
    recomputed.

    Args:
        paper: Object exposing ``title`` and ``references``; ``references``
            is a mapping of reference key -> object with an ``abstract``
            attribute.
    """
    super(BaseAbstractMethod, self).__init__()

    # The caches are shared across instances; only compute once per title.
    if paper.title in BaseAbstractMethod.abstractWords:
        return

    paperAbstractWords = {}
    paperAbstractBigrams = {}

    for (key, reference) in paper.references.items():
        paperAbstractWords[key] = util.getCapitalWords(reference.abstract)
        paperAbstractBigrams[key] = util.getNonStopNgrams(reference.abstract, 2)

    # NOTE(review): util.uniqueSets presumably keeps only the entries that
    # are unique to each reference -- confirm against util.
    paperAbstractWords = util.uniqueSets(paperAbstractWords)
    paperAbstractBigrams = util.uniqueSets(paperAbstractBigrams)

    # If a word appears in >= 25% of bigrams, then put it in the unigrams.
    for (referenceKey, bigrams) in paperAbstractBigrams.items():
        # Count stemmed words over every '-'-separated part of each bigram.
        counts = {}
        for bigram in bigrams:
            for word in bigram.split('-'):
                stem = util.STEMMER.stem(word)
                counts[stem] = counts.get(stem, 0) + 1

        # `counts` is empty whenever `bigrams` is, so the division below is
        # never reached with a zero denominator.
        bigramCount = len(bigrams)
        for (word, count) in counts.items():
            if float(count) / bigramCount >= 0.25:
                paperAbstractWords[referenceKey].add(word)

    if DEBUG:
        # Single-argument print(...) behaves the same as a statement (Python 2)
        # and as a function (Python 3).
        print("ABSTRACT:")
        for (ref, nouns) in paperAbstractWords.items():
            print("{0}\n\tWords -- {1}\n\tBigrams -- {2}".format(ref, nouns, paperAbstractBigrams[ref]))

    BaseAbstractMethod.abstractWords[paper.title] = paperAbstractWords
    BaseAbstractMethod.abstractBigrams[paper.title] = paperAbstractBigrams
def __init__(self, paper):
    """Build and cache title/author proper nouns for ``paper``'s references.

    The result is stored in the class-level mapping
    ``BaseTitleAuthorMethod.properNouns`` keyed by ``paper.title``. If an
    entry for this title already exists nothing is recomputed.

    Args:
        paper: Object exposing ``title`` and ``references``; ``references``
            is a mapping of reference key -> object with ``authors`` (an
            iterable of strings) and ``title`` attributes.
    """
    super(BaseTitleAuthorMethod, self).__init__()

    # The cache is shared across instances; only compute once per title.
    if paper.title in BaseTitleAuthorMethod.properNouns:
        return

    # Pre-build a list of all proper nouns in the references' title and authors.
    paperProperNouns = {}
    # TODO(eriq): remove stopwords and try to remove other capitalization false positives.
    for (key, reference) in paper.references.items():
        # Make sure to avoid initials (and double initials): only runs of
        # three or more word characters are kept.
        authorNouns = re.findall(r'\w{3,}', ' '.join(reference.authors))
        properNouns = set(authorNoun.upper() for authorNoun in authorNouns)
        properNouns |= util.removeTitleStopwords(util.getCapitalWords(reference.title))
        paperProperNouns[key] = properNouns

    # NOTE(review): the meaning of the second argument (1) is defined by
    # util.uniqueSets -- confirm against util.
    paperProperNouns = util.uniqueSets(paperProperNouns, 1)

    if DEBUG:
        # Single-argument print(...) behaves the same as a statement (Python 2)
        # and as a function (Python 3).
        print("Title/Author:")
        for (ref, nouns) in paperProperNouns.items():
            print("{0} -- {1}".format(ref, nouns))

    BaseTitleAuthorMethod.properNouns[paper.title] = paperProperNouns