Example 1
import copy
import math

import nltk
from nltk.util import bigrams, trigrams
from textblob import TextBlob

# StopwordsTokenizer is project-specific (not part of TextBlob or NLTK);
# see the sketch after the class for one possible implementation.


class WordGrapher(object):

    doc = ""
    blob = None
    docs = []

    bigrams = None
    trigrams = None

    tokens = None
    tokenizer = None

    tfidf = None

    # load stopword lists at class-definition time; skip silently if the
    # NLTK stopwords corpus has not been downloaded
    stopwords = []
    try:
        stopwords.extend(nltk.corpus.stopwords.words('indonesian'))
        stopwords.extend(nltk.corpus.stopwords.words('english'))
    except (IOError, LookupError):
        pass

    def __init__(self, doc=None, docs=None):
        self.tokenizer = StopwordsTokenizer(stopwords=self.stopwords)

        if doc:
            self.set_document(doc=doc)

        if docs:
            self.set_documents(docs=docs)

    def set_document(self, doc, docs_list_mode=False):
        if doc:
            return self.initialize_document(doc=doc, docs_list_mode=docs_list_mode)
        else:
            raise ValueError("Document must not be None or empty")

    def set_documents(self, docs):
        if docs and isinstance(docs, list) and len(docs) > 0:
            self.docs = [self.set_document(doc=doc, docs_list_mode=True) for doc in docs]
        else:
            raise ValueError("Documents must not be None or and empty List")

    def initialize_document(self, doc, docs_list_mode=False):
        if not docs_list_mode:
            self.doc = doc.lower()

            self.blob = TextBlob(text=self.doc, tokenizer=self.tokenizer)
            self.tokens = copy.deepcopy(self.blob.tokens)

            self.bigrams = self.bigramify(self.blob.tokens)
            self.tokens.extend(self.bigrams)

            self.trigrams = self.trigramify(self.blob.tokens)
            self.tokens.extend(self.trigrams)
        else:
            doc = doc.lower()

            blob = TextBlob(text=doc, tokenizer=self.tokenizer)
            tokens = copy.deepcopy(blob.tokens)

            bigram = self.bigramify(tokens=blob.tokens)
            tokens.extend(bigram)

            # compute trigrams from the original tokens, not the bigram-extended
            # list, to match the single-document branch above
            trigram = self.trigramify(tokens=blob.tokens)
            tokens.extend(trigram)

            return tokens

    def bigramify(self, tokens, as_string=True):
        if as_string:
            return ["%s %s" % (words[0], words[1]) for words in bigrams(tokens)]
        else:
            return bigrams(tokens)

    def trigramify(self, tokens, as_string=True):
        if as_string:
            return ["%s %s %s" % (words[0], words[1], words[2]) for words in trigrams(tokens)]
        else:
            return trigrams(tokens)

    def ngrams(self, n):
        return self.blob.ngrams(n=n)

    def freq(self, word, docs=None):
        if docs is None:
            return self.tokens.count(word)
        else:
            if not isinstance(docs, str):
                # a list of documents: join them into a single string first
                docs = " ".join(docs)

            blob = TextBlob(text=docs, tokenizer=self.tokenizer)
            tokens = list(blob.tokens)
            tokens.extend(self.bigramify(blob.tokens))
            tokens.extend(self.trigramify(blob.tokens))
            return tokens.count(word)

    def tf(self, word):
        return self.freq(word=word) / float(self.doc_word_count())

    def doc_word_count(self):
        return len(self.tokens)

    def num_docs_containing(self, word):
        if self.docs is None:
            return 2
        else:
            count = 0
            for document in self.docs:
                if self.freq(word=word, docs=document) > 0:
                    count += 1
            # +1 smoothing so idf() never divides by zero
            return 1 + count

    def idf(self, word):
        if self.docs is None:
            docs_length = 1
        else:
            docs_length = len(self.docs)

        num_docs = self.num_docs_containing(word)
        return math.log(docs_length / float(num_docs))

    def tf_idf(self, word):
        return self.tf(word) * self.idf(word)

    def analyze(self, count=10, percentage=False):
        if not self.doc or not self.docs:
            raise ValueError("Both the document and the document set must not be None or empty")

        score = {
            'freq': {},
            'tf': {},
            'idf': {},
            'tf-idf': {}
        }

        # duplicate tokens produce identical scores, so iterate over the unique tokens
        for token in set(self.tokens):
            score['freq'][token] = self.freq(token)
            score['tf'][token] = self.tf(token)
            score['idf'][token] = self.idf(token)
            score['tf-idf'][token] = math.fabs(self.tf_idf(token))

        # score['tf-idf'] already holds one entry per unique token
        final = dict(score['tf-idf'])

        result = sorted(final.items(), key=lambda x: x[1], reverse=True)[:count]
        if not percentage:
            self.tfidf = result
        else:
            # express each score relative to the highest score in the result
            top_score = 0.0
            for item in result:
                if item[1] > top_score:
                    top_score = item[1]
            self.tfidf = [(item[0], "%.2f%%" % (item[1] / top_score * 100)) for item in result]
        return self.tfidf

    def graph(self, word):
        return self.graph_doc(word=word)

    def graph_doc(self, word):
        if not self.tfidf:
            raise ValueError("Please call analyze first before creating a graph")

        result = {}

        # trigrams whose middle word contains the query word
        tris = self.trigramify(tokens=self.blob.tokens, as_string=False)
        matches = ["%s %s %s" % (tri[0], tri[1], tri[2]) for tri in tris if word in tri[1]]
        result['tris'] = [item for item in self.tfidf if item[0] in matches]

        # bigrams where either word contains the query word
        bis = self.bigramify(tokens=self.blob.tokens, as_string=False)
        matches = ["%s %s" % (bi[0], bi[1]) for bi in bis if word in bi[0] or word in bi[1]]
        result['twos'] = [item for item in self.tfidf if item[0] in matches]

        return result
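
The class above depends on a StopwordsTokenizer that is not shown anywhere in the snippet. Below is a minimal sketch, assuming it simply wraps textblob's WordTokenizer and filters out the stopword list it is given, followed by a hypothetical usage example (the documents and the query word are invented for illustration):

from textblob.tokenizers import WordTokenizer


class StopwordsTokenizer(WordTokenizer):
    """Sketch only: tokenize with TextBlob's WordTokenizer, then drop stopwords."""

    def __init__(self, stopwords=None):
        self.stopwords = set(stopwords or [])

    def tokenize(self, text, include_punc=False):
        words = super(StopwordsTokenizer, self).tokenize(text, include_punc=include_punc)
        return [word for word in words if word.lower() not in self.stopwords]


# Hypothetical usage of WordGrapher
doc = "python is a language used for text mining and text analysis"
docs = [
    "text mining extracts keywords from a set of documents",
    "python has many libraries for natural language processing",
]
wg = WordGrapher(doc=doc, docs=docs)
print(wg.analyze(count=5))      # top tokens ranked by |tf-idf|
print(wg.graph(word="text"))    # bigram/trigram entries around the query word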
Example 2
from textblob import TextBlob

#Word counts
text = TextBlob(
    "One time there was this pycheat in github. And it was Github where all the cheats were"
)
text.word_counts['github']

#Translation
text = TextBlob("Frankly!Can I get a cup of coffe con leche in plaza Mayor?")
text.detect_language()
text.translate(to="es")

#Parsing

text.parse()

#Slicing (first 7 characters)
text[0:7]

#Upper and lower case
text.upper()
text.lower()

#Find
text.find("leche")

#Get successive words

text.ngrams(n=3)

#Get in json format

text.json()
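
Note that detect_language() and translate() send the text to a remote Google Translate endpoint, and newer TextBlob releases deprecate or drop them. A hedged sketch of a guarded call (the exception handling is an assumption about how a missing helper fails):

from textblob import TextBlob

text = TextBlob("Frankly! Can I get a cup of coffee con leche in plaza Mayor?")
try:
    # older TextBlob versions: both calls hit a remote translation service
    print(text.detect_language())
    print(text.translate(to="es"))
except AttributeError:
    # assumption: newer releases removed these helpers from TextBlob entirely
    print("detect_language()/translate() are not available in this TextBlob version")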