Example #1
import operator
import re
import string
from collections import OrderedDict

from nltk.corpus import stopwords
from textblob import Blobber, Word
from textblob.taggers import NLTKTagger

#apostropheList (a contraction -> expansion mapping) is a module-level global in the original project
def filterAdj(phrasesDict,filename):
	phrasesDict = OrderedDict(sorted(phrasesDict.items(), key=operator.itemgetter(1), reverse=True))
	newPhrases = dict()
	exclude = set(string.punctuation)
	exclude.remove("_")
	for line_words, count in phrasesDict.items():
		#Preprocessing text
		line_words = ' '.join([apostropheList[word] if word in apostropheList else word for word in line_words.split()])
		line_words = ''.join(ch for ch in line_words if ch not in exclude)
		line_words = re.sub(r' [a-z][$]? ', ' ', line_words)
		line_words = [Word(word).lemmatize() for word in line_words.split() if(word not in stopwords.words("english") and not word.isdigit()) and len(word) > 2]
		line_words = ' '.join(line_words)
		if(len(line_words.strip(" ").split()) == 2):
			if(line_words in newPhrases):
				newPhrases[line_words] += count
			else:
				newPhrases[line_words] = count
	#Bigrams from the file
	newPhrases = OrderedDict(sorted(newPhrases.items(), key=operator.itemgetter(1), reverse=True))

	#Applying Threshold to Bigrams
	nouns1 = []
	for key, value in newPhrases.items():
		if value >= 3:
			nouns1.append(key)


	stopWords = stopwords.words("english")
	exclude = set(string.punctuation)
	reviewTitle = []
	reviewContent = []

	#Reading the original file
	with open(filename) as f:
		review = []
		for line in f:
			if line[:6] == "[+][t]":
				if review:
					reviewContent.append(review)
					review = []
				reviewTitle.append(line.split("[+][t]")[1].rstrip("\r\n"))
			elif line[:6] == "[-][t]":
				if review:
					reviewContent.append(review)
					review = []
				reviewTitle.append(line.split("[-][t]")[1].rstrip("\r\n"))

			else:
				if "##" in line:
					x = line.split("##")
					#if len(x[0]) != 0:
					for i in range(1, len(x)):
						review.append(x[i].rstrip("\r\n"))
				else:
					continue
		reviewContent.append(review)

		#tb = Blobber(pos_tagger=PerceptronTagger()) 
		tb = Blobber(pos_tagger=NLTKTagger())
		nounScores = dict()

		#Writing to a file (rebinds f; the input file has already been fully read)
		f = open('modified.txt', 'w')
		for a in range(len(reviewContent)):
			f.write("[t]")
			
			#Finding Bigrams in title
			text = reviewTitle[a]
		
			x = tb(text).tags #NLTK tagger		
			e = 0
				
			while e<len(x):
				tagList = ""
				wrt = x[e][0]
				e = e+1
				count = e
				tp = 0
				#treat a consecutive NN/JJ pair as a candidate bigram
				if(count<len(x) and x[count-1][1] in ("NN", "JJ") and x[count][1] in ("NN", "JJ")):
					tagList = x[count-1][0] + " " + x[count][0]
					count = count+1
				if tagList != "":
					if tagList in nouns1: 
						tagList = tagList.replace(' ', '')
						f.write(tagList)
						tp = 1
						e = count
				if tp == 0:
					f.write(wrt)
				f.write(" ")			


			f.write("\r\n")	
								
			#Finding bigrams in review
			for i in range(len(reviewContent[a])):
				text = reviewContent[a][i]
				x = tb(text).tags #NLTK tagger
				
				tagList = []
				e = 0
				f.write("##")

				while e<len(x):
					tagList = ""
					wrt = x[e][0]
					e = e+1
					count = e
					tp = 0
					#treat a consecutive NN/JJ pair as a candidate bigram
					if(count<len(x) and x[count-1][1] in ("NN", "JJ") and x[count][1] in ("NN", "JJ")):
						tagList = x[count-1][0] + " " + x[count][0]
						count = count+1
					if tagList != "":
						#Checking if consecutive nouns we found out are in noun phrases
						if tagList in nouns1: 
							tagList = tagList.replace(' ', '')
							f.write(tagList)
							tp = 1
							e = count
					if tp == 0:
						f.write(wrt)
					f.write(" ")
				f.write(".\r\n")
Example #2
def test_can_use_different_pos_tagger(self):
    tagger = NLTKTagger()
    blob = tb.TextBlob("this is some text", pos_tagger=tagger)
    assert_true(isinstance(blob.pos_tagger, NLTKTagger))
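
For context, a minimal sketch of what swapping in the NLTK tagger looks like outside the test suite (exact tags depend on the installed NLTK model, so the output shown is indicative):

from textblob import TextBlob
from textblob.taggers import NLTKTagger

blob = TextBlob("The quick brown fox jumps", pos_tagger=NLTKTagger())
print(blob.tags)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ...]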
Example #3
import operator
from collections import OrderedDict

import nltk
from nltk.corpus import stopwords
from textblob import Blobber
from textblob.taggers import NLTKTagger

#exclude (punctuation set) and maxHops (search window) are module-level globals in the original project
def findFeatures(reviewContent,filename):
    nounScores = dict()

    adjDict = dict()
    tb = Blobber(pos_tagger=NLTKTagger())

    for a in range(len(reviewContent)):  # Stores the score of the nouns
        #print("printing words::::")
        #print(reviewContent[a])
        text = ' '.join([word for word in reviewContent[a].split() if word not in stopwords.words("english")])
        text = ''.join(ch for ch in text if ch not in exclude)
        text = nltk.word_tokenize(text)
        x = nltk.pos_tag(text)

        # Get the noun/adjective words and store it in tagList
        tagList = []
        for e in x:
            if (e[1] == "NN" or e[1] == "JJ"):
                tagList.append(e)

        # Add the nouns(which are not in the nounScores dict) to the dict
        for e in tagList:
            if e[1] == "NN":
                if e[0] not in nounScores:
                    nounScores[e[0]] = 0

        # For every adjective, find nearby noun
        for l in range(len(tagList)):
            if ("JJ" in tagList[l][1]):
                j = k = leftHop = rightHop = -1

                for j in range(l + 1, len(tagList)):
                    if (j == l + maxHops):
                        break
                    if ("NN" in tagList[j][1]):
                        rightHop = (j - l)
                        break

                for k in range(l - 1, -1, -1):
                    if (k == l - maxHops):  # stop after hopping maxHops words to the left
                        break
                    if ("NN" in tagList[k][1]):
                        leftHop = (l - k)
                        break

                # Compare which noun is closer to adjective(left or right) and assign the adj to corresponding noun
                if (leftHop > 0 and rightHop > 0):
                    if (leftHop - rightHop) >= 0:
                        adjDict[tagList[l][0]] = tagList[j][0]
                        nounScores[tagList[j][0]] += 1
                    else:
                        adjDict[tagList[l][0]] = tagList[k][0]
                        nounScores[tagList[k][0]] += 1
                elif leftHop > 0:
                    adjDict[tagList[l][0]] = tagList[k][0]
                    nounScores[tagList[k][0]] += 1
                elif rightHop > 0:
                    adjDict[tagList[l][0]] = tagList[j][0]
                    nounScores[tagList[j][0]] += 1

    nounScores = OrderedDict(sorted(nounScores.items(), key=operator.itemgetter(1)))
    return filterAdj(nounScores, adjDict, filename)
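
The adjective-to-noun assignment above is a proximity heuristic: each adjective votes for the nearest noun within maxHops positions, with the right-hand noun winning ties. A standalone sketch of the same idea over a full (word, POS) list rather than the filtered tagList (maxHops = 4 is the assumed window):

def nearest_noun(tags, adj_index, max_hops=4):
    #return the word of the closest NN, right side winning ties, or None
    for offset in range(1, max_hops):
        right, left = adj_index + offset, adj_index - offset
        if right < len(tags) and tags[right][1] == "NN":
            return tags[right][0]
        if left >= 0 and tags[left][1] == "NN":
            return tags[left][0]
    return None

tags = [("screen", "NN"), ("is", "VBZ"), ("bright", "JJ"), ("and", "CC"), ("sharp", "JJ")]
print(nearest_noun(tags, 2))  # screen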
Example #4
import re

import pandas as pd
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.taggers import NLTKTagger

# Setup
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('brown')
# nltk.download('movie_reviews')

# Load config
import json

rx = re.compile('(["#\'\\%`])')
tb = Blobber(pos_tagger=NLTKTagger(), analyzer=NaiveBayesAnalyzer())


data = pd.read_csv('/media/salah/e58c5812-2860-4033-90c6-83b7ffaa8b88/MLStock/dataset/Layer1_dataset/Model2/Layer1_base_dataset.csv')
# Keeping only the necessary columns

data['headline'] = data['headline'].apply(lambda x: str(x).lower().replace(' ## ',''))

from nltk.corpus import sentiwordnet as swn
#result_reduce[1].split(',')[0]
def sent_from_text(text):
    test_b = tb( text )
    pos_count = 0.0
    neg_count = 0.0
    pos_sum = 0.0
    neg_sum = 0.0
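
The snippet is cut off before the four counters are used. A minimal sketch of one plausible continuation, accumulating positive/negative SentiWordNet scores per word (the helper name and first-synset scoring policy are assumptions, not the original code):

from nltk.corpus import sentiwordnet as swn

def sentiwordnet_scores(text):
    #sum the positive/negative scores of each word's first SentiWordNet synset
    pos_sum = neg_sum = 0.0
    for word in text.split():
        synsets = list(swn.senti_synsets(word))
        if synsets:
            pos_sum += synsets[0].pos_score()
            neg_sum += synsets[0].neg_score()
    return pos_sum, neg_sum

print(sentiwordnet_scores("good terrible service"))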
Example #5
# DELTA, semantic_similarity, and word_order_similarity are defined elsewhere in the original project
def similarity(sentence_1, sentence_2, info_content_norm):
    """
    Calculate the semantic similarity between two sentences. The last
    parameter is True or False depending on whether information content
    normalization is desired or not.
    """
    return DELTA * semantic_similarity(sentence_1, sentence_2, info_content_norm) + \
           (1.0 - DELTA) * word_order_similarity(sentence_1, sentence_2)


val = similarity("I hate Trump", "I like Trump", False)

if val > .7:
    print("close enough")
else:
    print("not even")

nltk_tagger = NLTKTagger()
#In Docker, run 'python -m textblob.download_corpora' to fetch the corpora


def analyze(content):
    #query() and TextBlob are defined/imported elsewhere in the original script
    zen = TextBlob(content)
    overall_total = 0.0
    overall_score = 0.0
    for sent in zen.sentences:
        res = query(sent)
        overall_total += res[0]
        overall_score += res[1]

    return [overall_total, overall_score]

Example #6
class BaseBlob(StringlikeMixin, BlobComparableMixin):
    """An abstract base class that all textblob classes will inherit from.
    Includes words, POS tag, NP, and word count properties. Also includes
    basic dunder and string methods for making objects like Python strings.

    :param text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionchanged:: 0.6.0
        ``clean_html`` parameter deprecated, as it was in NLTK.
    """
    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    translator = Translator()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self,
                 text,
                 tokenizer=None,
                 pos_tagger=None,
                 np_extractor=None,
                 analyzer=None,
                 parser=None,
                 classifier=None,
                 clean_html=False):
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError(
                "clean_html has been deprecated. "
                "To remove HTML markup, use BeautifulSoup's "
                "get_text() function")
        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    @cached_property
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))

    @cached_property
    def tokens(self):
        """Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
        """
        return WordList(self.tokenizer.tokenize(self.raw))

    def tokenize(self, tokenizer=None):
        """Return a list of tokens, using ``tokenizer``.

        :param tokenizer: (optional) A tokenizer object. If None, defaults to
            this blob's default tokenizer.
        """
        t = tokenizer if tokenizer is not None else self.tokenizer
        return WordList(t.tokenize(self.raw))

    def parse(self, parser=None):
        """Parse the text.

        :param parser: (optional) A parser instance. If ``None``, defaults to
            this blob's default parser.

        .. versionadded:: 0.6.0
        """
        p = parser if parser is not None else self.parser
        return p.parse(self.raw)

    def classify(self):
        """Classify the blob using the blob's ``classifier``."""
        if self.classifier is None:
            raise NameError("This blob has no classifier. Train one first!")
        return self.classifier.classify(self.raw)

    @cached_property
    def sentiment(self):
        """Return a tuple of form (polarity, subjectivity ) where polarity
        is a float within the range [-1.0, 1.0] and subjectivity is a float
        within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is
        very subjective.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity)``
        """
        return self.analyzer.analyze(self.raw)

    @cached_property
    def sentiment_assessments(self):
        """Return a tuple of form (polarity, subjectivity, assessments ) where
        polarity is a float within the range [-1.0, 1.0], subjectivity is a
        float within the range [0.0, 1.0] where 0.0 is very objective and 1.0
        is very subjective, and assessments is a list of polarity and
        subjectivity scores for the assessed tokens.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity,
        assessments)``
        """
        return self.analyzer.analyze(self.raw, keep_assessments=True)

    @cached_property
    def polarity(self):
        """Return the polarity score as a float within the range [-1.0, 1.0]

        :rtype: float
        """
        return PatternAnalyzer().analyze(self.raw)[0]

    @cached_property
    def subjectivity(self):
        """Return the subjectivity score as a float within the range [0.0, 1.0]
        where 0.0 is very objective and 1.0 is very subjective.

        :rtype: float
        """
        return PatternAnalyzer().analyze(self.raw)[1]

    @cached_property
    def noun_phrases(self):
        """Returns a list of noun phrases for this blob."""
        return WordList([
            phrase.strip().lower()
            for phrase in self.np_extractor.extract(self.raw)
            if len(phrase) > 1
        ])

    @cached_property
    def pos_tags(self):
        """Returns an list of tuples of the form (word, POS tag).

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        if isinstance(self, TextBlob):
            return [
                val for sublist in [s.pos_tags for s in self.sentences]
                for val in sublist
            ]
        else:
            return [(Word(word, pos_tag=t), unicode(t))
                    for word, t in self.pos_tagger.tag(self)
                    if not PUNCTUATION_REGEX.match(unicode(t))]

    tags = pos_tags

    @cached_property
    def word_counts(self):
        """Dictionary of word frequencies in this text.
        """
        counts = defaultdict(int)
        stripped_words = [lowerstrip(word) for word in self.words]
        for word in stripped_words:
            counts[word] += 1
        return counts

    @cached_property
    def np_counts(self):
        """Dictionary of noun phrase frequencies in this text.
        """
        counts = defaultdict(int)
        for phrase in self.noun_phrases:
            counts[phrase] += 1
        return counts

    def ngrams(self, n=3):
        """Return a list of n-grams (tuples of n successive words) for this
        blob.

        :rtype: List of :class:`WordLists <WordList>`
        """
        if n <= 0:
            return []
        grams = [
            WordList(self.words[i:i + n])
            for i in range(len(self.words) - n + 1)
        ]
        return grams

    def translate(self, from_lang="auto", to="en"):
        """Translate the blob to another language.
        Uses the Google Translate API. Returns a new TextBlob.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("Simple is better than complex")
            >>> b.translate(to="es")
            TextBlob('Lo simple es mejor que complejo')

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. versionadded:: 0.5.0.

        :param str from_lang: Language to translate from. If ``None``, will attempt
            to detect the language.
        :param str to: Language to translate to.
        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        return self.__class__(
            self.translator.translate(self.raw,
                                      from_lang=from_lang,
                                      to_lang=to))

    def detect_language(self):
        """Detect the blob's language using the Google Translate API.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("bonjour")
            >>> b.detect_language()
            u'fr'

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. versionadded:: 0.5.0

        :rtype: str
        """
        return self.translator.detect(self.raw)

    def correct(self):
        """Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        # regex matches: word or punctuation or whitespace
        tokens = nltk.tokenize.regexp_tokenize(self.raw, r"\w+|[^\w\s]|\s")
        corrected = (Word(w).correct() for w in tokens)
        ret = ''.join(corrected)
        return self.__class__(ret)

    def _cmpkey(self):
        """Key used by ComparableMixin to implement all rich comparison
        operators.
        """
        return self.raw

    def _strkey(self):
        """Key used by StringlikeMixin to implement string methods."""
        return self.raw

    def __hash__(self):
        return hash(self._cmpkey())

    def __add__(self, other):
        '''Concatenates two text objects the same way Python strings are
        concatenated.

        Arguments:
        - `other`: a string or a text object
        '''
        if isinstance(other, basestring):
            return self.__class__(self.raw + other)
        elif isinstance(other, BaseBlob):
            return self.__class__(self.raw + other.raw)
        else:
            raise TypeError(
                'Operands must be either strings or {0} objects'.format(
                    self.__class__.__name__))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split() except returns a
        WordList.

        :rtype: :class:`WordList <WordList>`
        """
        return WordList(self._strkey().split(sep, maxsplit))
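
A short usage sketch of the public API this base class provides through TextBlob (sentiment numbers vary by analyzer, so the values below are indicative):

from textblob import TextBlob

blob = TextBlob("TextBlob makes text processing simple. It is fun!")
print(blob.words[:3])       # ['TextBlob', 'makes', 'text']
print(blob.ngrams(n=2)[0])  # ['TextBlob', 'makes']
print(blob.sentiment)       # Sentiment(polarity=..., subjectivity=...)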
Example #7
class Blobber(object):
    """A factory for TextBlobs that all share the same tagger,
    tokenizer, parser, classifier, and np_extractor.

    Usage:

        >>> from textblob import Blobber
        >>> from textblob.taggers import NLTKTagger
        >>> from textblob.tokenizers import SentenceTokenizer
        >>> tb = Blobber(pos_tagger=NLTKTagger(), tokenizer=SentenceTokenizer())
        >>> blob1 = tb("This is one blob.")
        >>> blob2 = tb("This blob has the same tagger and tokenizer.")
        >>> blob1.pos_tagger is blob2.pos_tagger
        True

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionadded:: 0.4.0
    """

    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self,
                 tokenizer=None,
                 pos_tagger=None,
                 np_extractor=None,
                 analyzer=None,
                 parser=None,
                 classifier=None):
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    def __call__(self, text):
        """Return a new TextBlob object with this Blobber's ``np_extractor``,
        ``pos_tagger``, ``tokenizer``, ``analyzer``, and ``classifier``.

        :returns: A new :class:`TextBlob <TextBlob>`.
        """
        return TextBlob(text,
                        tokenizer=self.tokenizer,
                        pos_tagger=self.pos_tagger,
                        np_extractor=self.np_extractor,
                        analyzer=self.analyzer,
                        parser=self.parser,
                        classifier=self.classifier)

    def __repr__(self):
        classifier_name = self.classifier.__class__.__name__ + "()" if self.classifier else "None"
        return ("Blobber(tokenizer={0}(), pos_tagger={1}(), "
                    "np_extractor={2}(), analyzer={3}(), parser={4}(), classifier={5})")\
                    .format(self.tokenizer.__class__.__name__,
                            self.pos_tagger.__class__.__name__,
                            self.np_extractor.__class__.__name__,
                            self.analyzer.__class__.__name__,
                            self.parser.__class__.__name__,
                            classifier_name)

    __str__ = __repr__
Example #8
def filterAdj(nounScores, adjDict, filename):
    adjectList = list(adjDict.keys())
    nouns = []
    for key, value in nounScores.items():
        if value >= 3:
            nouns.append(key)
    nouns1 = [
        "sound quality", "battery life", "great phone", "cell phone",
        "menu option", "color screen", "flip phone", "samsung phone",
        "nokia phones", "corporate email", "ring tone", "tmobile service"
    ]

    nouns = set(nouns)

    stopWords = stopwords.words("english")
    exclude = set(string.punctuation)
    reviewTitle = []
    reviewContent = []

    with open(filename) as f:
        review = []
        for line in f:
            if line[:6] == "[+][t]":
                if review:
                    reviewContent.append(review)
                    review = []
                reviewTitle.append(line.split("[+][t]")[1].rstrip("\r\n"))
            elif line[:6] == "[-][t]":
                if review:
                    reviewContent.append(review)
                    review = []
                reviewTitle.append(line.split("[-][t]")[1].rstrip("\r\n"))
            else:
                if "##" in line:
                    x = line.split("##")
                    #if len(x[0]) != 0:
                    for i in range(1, len(x)):
                        review.append(x[i].rstrip("\r\n"))
                else:
                    continue
        reviewContent.append(review)

    #tb = Blobber(pos_tagger=PerceptronTagger())
    tb = Blobber(pos_tagger=NLTKTagger())
    nounScores = dict()
    f = open('modified.txt', 'w')
    for a in range(len(reviewContent)):
        f.write("[t]" + reviewTitle[a])
        f.write("\r\n")
        #Stores the score of the nouns
        for i in range(len(reviewContent[a])):
            text = reviewContent[a][i]
            x = tb(text).tags  #NLTK tagger
            #Get the noun/adjective words and store it in tagList
            tagList = []
            e = 0
            f.write("##")

            while e < len(x):
                tagList = []
                f.write(x[e][0])
                e = e + 1
                count = e
                if (count < len(x) and x[count - 1][1] == "NN"
                        and x[count][1] == "NN"):
                    tagList.append(x[count - 1][0])

                    while (count < len(x) and x[count][1] == "NN"):
                        tagList.append(x[count][0])
                        count = count + 1
                if tagList != [] and len(tagList) == 2:
                    if set(tagList) <= nouns:

                        for t in range(1, len(tagList)):
                            f.write(tagList[t])
                        e = count
                f.write(" ")
            f.write(".\r\n")

    return adjectList
Example #9
def findFeatures(reviewContent, filename):
    #nounScores is the dict containing nouns from all reviews and their respective scores from HAC algorithm
    nounScores = dict()

    #adjDict dict contains adjective and the corresponding noun which it is assigned to
    adjDict = dict()
    tb = Blobber(pos_tagger=NLTKTagger())

    for a in range(len(reviewContent)):  #Stores the score of the nouns
        for i in range(len(reviewContent[a])):
            text = ' '.join([
                word for word in reviewContent[a][i].split()
                if word not in stopwords.words("english")
            ])
            text = ''.join(ch for ch in text if ch not in exclude)
            text = nltk.word_tokenize(text)
            x = nltk.pos_tag(text)

            #Get the noun/adjective words and store it in tagList
            tagList = []
            for e in x:
                if (e[1] == "NN" or e[1] == "JJ"):
                    tagList.append(e)

            #Add the nouns(which are not in the nounScores dict) to the dict
            for e in tagList:
                if e[1] == "NN":
                    if e[0] not in nounScores:
                        nounScores[e[0]] = 0

            #For every adjective, find nearby noun
            for l in range(len(tagList)):
                if ("JJ" in tagList[l][1]):
                    j = k = leftHop = rightHop = -1

                    #Find the closest noun to the right of the adjective in the line
                    for j in range(l + 1, len(tagList)):
                        if (j == l + maxHops):
                            break
                        if ("NN" in tagList[j][1]):
                            rightHop = (j - l)
                            break

                    #Find the closest noun to the left of the adjective in the line
                    for k in range(l - 1, -1, -1):
                        #In case 'maxHops' words were hopped and no noun was found, ignore the adjective
                        if (k == l - maxHops):
                            break
                        if ("NN" in tagList[k][1]):
                            leftHop = (l - k)
                            break

                    #Compare which noun is closer to adjective(left or right) and assign the adj to corresponding noun
                    if (leftHop > 0 and rightHop >
                            0):  #If nouns exist on both sides of adjective
                        if (leftHop - rightHop) >= 0:  #If left noun is farther
                            adjDict[tagList[l][0]] = tagList[j][0]
                            nounScores[tagList[j][0]] += 1
                        else:  #If right noun is farther
                            adjDict[tagList[l][0]] = tagList[k][0]
                            nounScores[tagList[k][0]] += 1
                    elif leftHop > 0:  #If noun is not found on RHS of adjective
                        adjDict[tagList[l][0]] = tagList[k][0]
                        nounScores[tagList[k][0]] += 1
                    elif rightHop > 0:  #If noun is not found on LHS of adjective
                        adjDict[tagList[l][0]] = tagList[j][0]
                        nounScores[tagList[j][0]] += 1

    nounScores = OrderedDict(
        sorted(nounScores.items(), key=operator.itemgetter(1)))
    return filterAdj(nounScores, adjDict, filename)
Example #10
from textblob import Blobber
from textblob.taggers import NLTKTagger

# used to combine commonly used taggers, chunkers, etc to keep code DRY
tb = Blobber(pos_tagger=NLTKTagger())

blob = tb("This is amazing!")
another_blob = tb("This sucks!")

print(blob.pos_tagger is another_blob.pos_tagger)  # True: both blobs share the same tagger instance
Example #11
def getList():
	#reviewTitle, reviewContent, and exclude are module-level globals in the original project
	#reading from the created file "modified.txt"
	with open("modified.txt") as f:
		review = []
		for line in f:
			if line[:3] == "[t]":
				if review:
					reviewContent.append(review)
					review = []
				reviewTitle.append(line.split("[t]")[1].rstrip("\r\n"))
			else:
				if "##" in line:
					x = line.split("##")
					for i in range(1, len(x)):
						review.append(x[i].rstrip("\r\n"))
				else:
					continue
		reviewContent.append(review)

	tb = Blobber(pos_tagger=NLTKTagger())
	nounScores = dict()
	for a in range(len(reviewContent)):								#Stores the score of the nouns
		for i in range(len(reviewContent[a])):
			#text = reviewContent[a][i]
			text = ' '.join([word for word in reviewContent[a][i].split() if word not in stopwords.words("english")])
			text = ''.join(ch for ch in text if ch not in exclude)
			text = nltk.word_tokenize(text)
			x = nltk.pos_tag(text)
			#x = TextBlob(text).tags #textblob tagger
			#x = tb(text).tags #Perceptron tagger 
			#Get the noun/adjective words and store it in tagList
			tagList = []
			for e in x:
				if(e[1] == "NN" or e[1] == "JJ"):
					tagList.append(e)
	
			#Add the nouns(which are not in the nounScores dict) to the dict
			for e in tagList:
				if e[1] == "NN":
					if e[0] not in nounScores:
						nounScores[e[0]] = 0

			#For every adjective, find nearby noun
			for l in range(len(tagList)):
				if(tagList[l][1] == "JJ"):
					check=0
					j = 0
					k = 0
					ct1 = 0
					for j in range(l + 1, len(tagList)):
						if ct1 == 4:	#stop after hopping 4 words to the right
							break
						ct1 += 1
						if(tagList[j][1] == "NN"):
							#nounScores[tagList[j][0]] += 1
							check = 1
							break
					ct = 0		
					if(l > 0):
						if j == 0:
							j = len(tagList)
						for k in range(l - 1, -1, -1):	#scan left, including index 0
							if ct == 4:
								break
							ct += 1
							if(tagList[k][1] == "NN"):
								if(j != len(tagList)):
									nounScores[tagList[min(j, k)][0]] += 1
								else:
									nounScores[tagList[k][0]] += 1	
								break
					elif check == 1:
						nounScores[tagList[j][0]] += 1
	
	nounScores = OrderedDict(sorted(nounScores.items(), key=operator.itemgetter(1)))
	nouns = []
	for key, value in nounScores.items():
		if value >= 3:
			nouns.append(key)
	return nouns
Example #12
from textblob import TextBlob
from textblob.taggers import NLTKTagger


def makeTextBlob(txt):
    """Wrapper for a TextBlob call using the NLTK POS tagger."""
    return TextBlob(txt, pos_tagger=NLTKTagger())
Example #13
def getList():
    """

    :rtype: object
    """
    # reading from the created file "modified.txt"
    adjNounAll = dict()
    adjNounList = dict()
    with open("modified.txt") as f:
        review = []
        for line in f:
            if line[:3] == "[t]":
                if review:
                    reviewContent.append(review)
                    review = []
            else:
                if "##" in line:
                    x = line.split("##")
                    for i in range(1, len(x)):
                        review.append(x[i].rstrip("\r\n"))
                else:
                    continue
        reviewContent.append(review)

    tb = Blobber(pos_tagger=NLTKTagger())
    nounScores = dict()
    for a in range(len(reviewContent)):
        for i in range(len(reviewContent[a])):
            text = ' '.join([
                word for word in reviewContent[a][i].split()
                if word not in stopwords.words("english")
            ])
            text = ''.join(ch for ch in text if ch not in exclude)
            text = nltk.word_tokenize(text)
            x = nltk.pos_tag(text)
            tagList = []
            for e in x:
                if (e[1] == "NN" or e[1] == "JJ"):
                    tagList.append(e)

            # Add the nouns(which are not in the nounScores dict) to the dict
            for e in tagList:
                if e[1] == "NN":
                    if e[0] not in nounScores:
                        nounScores[e[0]] = 0

            # For every adjective, find a nearby noun
            for l in range(len(tagList)):
                if (tagList[l][1] == "JJ"):
                    check = 0
                    j = 0
                    k = 0
                    for j in range(l + 1, len(tagList)):
                        if (tagList[j][1] == "NN"):
                            check = 1
                            break
                    ct = 0
                    if (l > 0):
                        if j == 0:
                            j = len(tagList)
                        for k in range(l - 1, -1, -1):  # scan left, including index 0
                            if ct == 4:
                                break
                            ct += 1
                            if (tagList[k][1] == "NN"):
                                if (j != len(tagList)):
                                    nounScores[tagList[min(j, k)][0]] += 1
                                    adjNounAll[tagList[min(j, k)][0]] = tagList[l][0]
                                else:
                                    nounScores[tagList[k][0]] += 1
                                    adjNounAll[tagList[k][0]] = tagList[l][0]
                                break
                    elif check == 1:
                        nounScores[tagList[j][0]] += 1
                        adjNounAll[tagList[j][0]] = tagList[l][0]

    nounScores = OrderedDict(
        sorted(nounScores.items(), key=operator.itemgetter(1)))
    nouns = []
    for key, value in nounScores.items():
        if value >= 3:
            nouns.append(key)
            adjNounList[key] = adjNounAll[key]
    return [nouns, adjNounList]
Example #14
    # (snippet begins mid-loop: `i` is a raw sentence; `temp` and `data` are defined above in the original)
    #tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append(j.lower())  # lowercase each token for a case-insensitive vocabulary

    data.append(temp)

# Create CBOW model (gensim < 4.0 API; a gensim 4.x equivalent is sketched below)
model = gensim.models.Word2Vec(data, min_count = 1, size = 100, window = 5)

X = model[model.wv.vocab]
words = list(model.wv.vocab)
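
For reference, a minimal sketch of the same CBOW setup against the gensim 4.x API, where the size parameter and the vocabulary attributes were renamed (an assumed modernization, not the original code):

import gensim

model = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)
X = model.wv[model.wv.index_to_key]   # embedding matrix, one row per vocabulary word
words = list(model.wv.index_to_key)   # vocabulary, most frequent first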

#*******************************#

# Naive noun plotting
nltkTagger = NLTKTagger()
blob = TextBlob(s, pos_tagger=nltkTagger)
allTags = blob.pos_tags

nouns = []

# keep only the nouns (NN)
for n in allTags:
    if str(n[1]) == 'NN':
        nouns.append(n)
# print('NOUN LIST: ', nouns)

# Count noun frequencies (collapsing duplicates)
nounFrequencies = {}
for n in nouns:
    if n[0] in nounFrequencies:
        nounFrequencies[n[0]] += 1
    else:
        nounFrequencies[n[0]] = 1