def classifyOpportunity(self, title, info):
        # Tokenize both fields into stopword-filtered unigrams; the keyword
        # checks below run in priority order, mostly preferring title
        # matches over description matches.
        unigramsTitle = TokenizeOnWhitespacePunctuation(
            title, applyStopwords=True).getUnigrams()
        unigramsInfo = TokenizeOnWhitespacePunctuation(
            info, applyStopwords=True).getUnigrams()

        if self.checkFellowshipKeywordsTitle(unigramsTitle):
            tag = 'Fellowship'
        elif self.checkInternshipKeywords(unigramsTitle):
            tag = 'Internship'
        elif self.checkScholarshipKeywords(unigramsTitle):
            tag = 'Scholarship'
        elif self.checkGrantKeywords(unigramsTitle):
            tag = 'Grant'
        elif self.checkFellowshipKeywordsInfo(unigramsInfo):
            tag = 'Fellowship'
        elif self.checkInternshipKeywords(unigramsInfo):
            tag = 'Internship'
        elif self.checkGrantKeywords(unigramsInfo):
            tag = 'Grant'
        elif self.checkAwardKeywords(unigramsTitle):
            tag = 'Award'
        elif self.checkScholarshipKeywords(unigramsInfo):
            tag = 'Scholarship'
        elif self.checkResearchKeywords(unigramsTitle):
            tag = 'Research'
        elif self.checkResearchKeywords(unigramsInfo):
            tag = 'Research'
        else:
            tag = 'Other'

        return tag
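
A minimal usage sketch (hypothetical: OpportunityClassifier is an assumed name for the class this method belongs to, and the example strings are illustrative):

    # Hypothetical usage; OpportunityClassifier is an assumed class name.
    classifier = OpportunityClassifier()
    tag = classifier.classifyOpportunity(
        'Summer Internship in Marine Biology',
        'A ten-week paid placement for undergraduates.')
    # 'Internship' -- matched by the title keyword check before any
    # description keywords are consulted
    print(tag)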
Example #2
    @staticmethod
    def getNgrams(text, getUnigrams=True, getBigrams=True, getTrigrams=False):
        # Build n-grams sentence by sentence so that bigrams and trigrams
        # never span a sentence boundary.
        unigrams = []
        bigrams = []
        trigrams = []

        sentences = TokenizeIntoSentences().doTokenize(text)
        for sentence in sentences:
            sentenceUnigrams = TokenizeOnWhitespacePunctuation(
                sentence, keepCaps=False, applyStopwords=True).getUnigrams()
            if getUnigrams:
                unigrams.extend(sentenceUnigrams)

            if getBigrams:
                bigrams.extend(
                    ' '.join(sentenceUnigrams[i:i + 2])
                    for i in range(len(sentenceUnigrams) - 1))

            if getTrigrams:
                trigrams.extend(
                    ' '.join(sentenceUnigrams[i:i + 3])
                    for i in range(len(sentenceUnigrams) - 2))

        ngramsList = [unigrams, bigrams, trigrams]

        return ngramsList
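
A short usage sketch (hypothetical input; the exact tokens depend on the stopword list used by TokenizeOnWhitespacePunctuation):

    # Hypothetical call; output tokens depend on the stopword list in use.
    unigrams, bigrams, trigrams = getNgrams(
        'Cats chase birds. Birds eat seeds.', getTrigrams=True)
    # Because n-grams are built per sentence, no bigram or trigram spans
    # the boundary between the first and second sentence.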
Example #3
    def test_RemoveUrls(self):
        # set up: the URL-like token 'cats.org' should be dropped entirely
        teststring = 'I like cats cats.org'
        unigrams = ['i', 'like', 'cats']

        # test
        testtokenize = TokenizeOnWhitespacePunctuation(teststring)
        self.assertEqual(unigrams, testtokenize.getUnigrams())
Example #4
    def test_TokenizeOnWhitespacePunctuationUnigrams(self):
        # set up
        teststring = 'I like cats and birds.'
        unigrams = ['i', 'like', 'cats', 'and', 'birds']

        # test
        testtokenize = TokenizeOnWhitespacePunctuation(teststring)
        self.assertEqual(unigrams, testtokenize.getUnigrams())
Example #5
    def test_BothUnigramsBigramsApplyStopwords(self):
        # set up: 'i', 'like', and 'and' are stopwords, so only the two
        # content tokens and their single bigram survive
        teststring = 'I like cats and birds.'
        both = ['cats', 'birds', 'cats birds']

        # test
        testtokenize = TokenizeOnWhitespacePunctuation(teststring,
                                                       applyStopwords=True)
        self.assertEqual(both, testtokenize.getBothUnigramsBigrams())
Example #6
    def test_TokenizeOnWhitespacePunctuationBothUnigramsBigrams(self):
        # set up
        teststring = 'I like cats and birds.'
        both = [
            'i', 'like', 'cats', 'and', 'birds', 'i like', 'like cats',
            'cats and', 'and birds'
        ]

        # test
        testtokenize = TokenizeOnWhitespacePunctuation(teststring)
        self.assertEqual(both, testtokenize.getBothUnigramsBigrams())