Example #1
    def run(self, data):
        terms = {}
        results = []
        try:
            for corpus in data:
                tokens = corpus.tokenized_contents
                for token in tokens:
                    terms[token] = terms.get(token, 0) + 1
            for term, freq in terms.items():
                results.append({"term": term, "frequency": freq})

            # Sort results by term frequency, most frequent first.
            results.sort(key=lambda result: result['frequency'],
                         reverse=True)

            return {"entities": [], "sentences": results}

        except LookupError:
            raise TransactionException('NLTK \'Punkt\' Model not installed.',
                                       500)
        except TypeError:
            raise TransactionException('Corpus contents does not exist.')
Example #2
    def parse_json(self, json_data):
        try:
            input_data = json.loads(json_data.decode())
            self.transaction_id = input_data['transaction_id']
            self.operation = input_data['operation']
            self.library = input_data['library']
            if 'user_id' in input_data:
                self.user_id = input_data['user_id']
            if 'cleanup' in input_data:
                self.cleanups = input_data['cleanup']
            self.corpora_ids = input_data['corpora_ids']
            if 'tokenizer' in input_data:
                self.tokenizer = input_data['tokenizer']
        except KeyError:
            raise TransactionException(
                'Missing property transaction_id, operation, library or corpora_ids.'
            )
        except ValueError:
            raise TransactionException('Could not parse JSON.')
        try:
            # Load the requested corpora from the database.
            corpora = DatabaseAdapter.getDB().corpus
            for corpus_id in self.corpora_ids:
                corpus = corpora.find_one({"_id": ObjectId(corpus_id)})
                self.corpora.append(
                    Corpus(corpus_id, corpus["title"], corpus["contents"],
                           corpus["tags"]))
        except (TypeError, InvalidId):
            raise TransactionException('Could not find corpus.')
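For reference, a minimal payload this parser would accept might look like the following; the field names come from the code above, the values are hypothetical, and only transaction_id, operation, library and corpora_ids are required:

import json

payload = json.dumps({
    "transaction_id": "42",                   # hypothetical values throughout
    "operation": "tfidf",
    "library": "nltk",
    "corpora_ids": ["5a2f0cbd6e1f2a0001234567"],
    "tokenizer": "word_tokenize_treebank"     # optional, like user_id / cleanup
}).encode()
# parse_json(payload) would then populate the transaction fields above.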
Example #3
    def run(self, data):
        results = []
        try:
            for corpus in data:
                first = {}
                second = {}
                third = {}
                temp_corpus = " ".join(re.split(r'\s|\n', corpus.contents))
                temp_bubble = SPLAT(temp_corpus.rstrip("\n"))
                pronouns = temp_bubble.raw_pronouns()
                sents = temp_bubble.sents()

                # Bucket each pronoun by grammatical person.
                for p, v in pronouns.items():
                    if v[1] == "1st-Person":
                        first[p] = v
                    elif v[1] == "2nd-Person":
                        second[p] = v
                    elif v[1] == "3rd-Person":
                        third[p] = v

                results.append({
                    'corpus_id': corpus.id,
                    'first-person': first,
                    'second-person': second,
                    'third-person': third,
                    'sentences': sents
                })

            return json.dumps(results)
        except TypeError as e:
            print(e)
            raise TransactionException("Failed to run SplatPronouns.")
Example #4
    def run(self, data):
        results = []
        try:
            for corpus in data:
                pos_parsed = {}
                temp_bubble = SPLAT(corpus.contents)
                pos_tags = temp_bubble.pos()
                pos_counts = temp_bubble.pos_counts()
                # Invert the (token, tag) pairs into tag -> unique tokens.
                for token, tag in pos_tags:
                    if tag in pos_parsed:
                        if token not in pos_parsed[tag]:
                            pos_parsed[tag].append(token)
                    else:
                        pos_parsed[tag] = [token]

                results.append({
                    'corpus_id': corpus.id,
                    'pos_tags': pos_parsed,
                    'pos_counts': pos_counts
                })

            return json.dumps(results)
        except TypeError as e:
            print(e)
            raise TransactionException('Failed to run SplatPOSFrequencies.')
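The tag-to-tokens inversion above can also be written with collections.defaultdict; a minimal sketch, assuming pos() yields (token, tag) pairs:

from collections import defaultdict

def invert_pos_tags(pos_tags):
    # Group the unique tokens seen under each POS tag.
    parsed = defaultdict(set)
    for token, tag in pos_tags:
        parsed[tag].add(token)
    return {tag: sorted(tokens) for tag, tokens in parsed.items()}

# invert_pos_tags([("dog", "NN"), ("runs", "VBZ"), ("dog", "NN")])
# -> {'NN': ['dog'], 'VBZ': ['runs']}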
Example #5
 def run(self, data):
     self.num_docs = len(data)
     try:
         results = []
         for corpus in data:
             terms_in_doc = {}
             tokens = corpus.tokenized_contents
             for word in tokens:
                 terms_in_doc[word] = terms_in_doc.get(word, 0) + 1
             for word in terms_in_doc:
                 # Document frequency: count the word once per document
                 # that contains it.
                 self.global_term_freq[word] = \
                     self.global_term_freq.get(word, 0) + 1
             self.global_terms_in_doc[corpus.id] = terms_in_doc
         for corpus in data:
             doc_terms = self.global_terms_in_doc[corpus.id]
             # The most frequent term in the document normalizes the tf term.
             max_freq = max(doc_terms.values())
             for term, freq in doc_terms.items():
                 # Smoothed inverse document frequency.
                 idf = math.log(
                     float(self.num_docs) /
                     float(1 + self.global_term_freq[term]))
                 tfidf = float(freq) / float(max_freq) * idf
                 results.append({
                     'corpus_id': corpus.id,
                     'term': term,
                     'importance': tfidf
                 })
         return results
     except LookupError:
         raise TransactionException('NLTK \'Punkt\' Model not installed.',
                                    500)
     except TypeError:
         raise TransactionException('Corpus contents does not exist.')
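As a quick sanity check of the formula above, a worked example with hypothetical numbers (num_docs, freq, max_freq and doc_freq are made up for illustration):

import math

num_docs = 10          # hypothetical corpus of ten documents
freq, max_freq = 3, 6  # term count vs. the document's most frequent term
doc_freq = 4           # number of documents containing the term

idf = math.log(num_docs / (1 + doc_freq))  # log(10 / 5) ~= 0.693
tfidf = (freq / max_freq) * idf            # 0.5 * 0.693 ~= 0.347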
Example #6
    def read_corpora(self, corpora_ids):
        try:
            # Load the requested corpora from the database.
            corpora = DatabaseAdapter.getDB().corpus
            for corpus_id in corpora_ids:
                corpus = corpora.find_one({"_id": ObjectId(corpus_id)})
                self.corpora.append(
                    Corpus(corpus_id, corpus["title"], corpus["contents"],
                           corpus["tags"]))
        except (TypeError, InvalidId):
            raise TransactionException('Could not find corpus.')
Example #7
    def parse_json(self, json_data):
        try:
            input_data = json.loads(json_data.decode())

            self.transaction_id = input_data['transaction_id']
            self.operation = input_data['operation']
            self.library = input_data['library']
            self.analysis_name = input_data['analysis_name']
            self.time_created = input_data['time_created']

            if 'user_id' in input_data:
                self.user_id = input_data['user_id']
            if 'cleanup' in input_data:
                self.cleanups = input_data['cleanup']
            self.corpora_ids = input_data['corpora_ids']
            if 'tokenizer' in input_data:
                self.tokenizer = input_data['tokenizer']

        except KeyError:
            raise TransactionException(
                'Missing property transaction_id, operation, library, '
                'analysis_name, time_created or corpora_ids.'
            )
        except ValueError:
            raise TransactionException('Could not parse JSON.')
Example #8
def get_operation_handler(operation):
    if operation == 'lemmatize_wordnet':
        return LemmatizerWordNet()
    elif operation == 'pos_tag':
        return PosTag()
    elif operation == 'removecapsgreedy':
        return RemoveCapsGreedy()
    elif operation == 'removecapsnnp':
        return RemoveCapsPreserveNNP()
    elif operation == 'removepunct':
        return RemovePunct()
    elif operation == 'remove_stopwords':
        return RemoveStopwords()
    elif operation == 'sentence_tokenize':
        return SentenceTokenize()
    elif operation == 'stem_porter':
        return StemmerPorter()
    elif operation == 'stem_lancaster':
        return StemmerLancaster()
    elif operation == 'stem_snowball':
        return StemmerSnowball()
    elif operation == 'tfidf':
        return Tfidf()
    elif operation == 'topic_model':
        return TopicModel()
    elif operation == 'wordcloudop':
        return WordCloudOp()
    elif operation == 'word_tokenize_treebank':
        return WordTokenizeTreebank()
    elif operation == 'word_tokenize_whitespace_punct':
        return WordTokenizeWhitespacePunct()
    elif operation == 'word_tokenize_stanford':
        return WordTokenizeStanford()
    elif operation == 'word_tokenize_spaces':
        return WordTokenizeSpaces()
    elif operation == 'word_tokenize_tabs':
        return WordTokenizeTabs()
    elif operation == 'nlp-pos':
        return StanfordCoreNLP(['pos'])
    elif operation == 'nlp-ner':
        return StanfordCoreNLP(['pos', 'ner'])
    elif operation == 'noop':
        return NoOp()
    else:
        raise TransactionException("The requested operation does not exist.")
Example #9
    def run(self, data):
        results = []
        try:
            for corpus in data:
                # Strip silent-pause markers before analysis.
                split_string = corpus.contents.split(" ")
                temp_corpus = [w for w in split_string
                               if w not in ("{SL}", "{sl}")]
                temp_corpus_contents = " ".join(temp_corpus)
                temp_bubble = SPLAT(temp_corpus_contents)
                temp_trees = TreeStringParser().get_parse_trees(
                    temp_bubble.sents())
                cdensity = cUtil.calc_content_density(temp_trees)
                idensity = cUtil.calc_idea_density(temp_trees)[0]
                flesch_score = temp_bubble.flesch_readability()
                kincaid_score = temp_bubble.kincaid_grade_level()
                types = len(temp_bubble.types())
                tokens = len(temp_bubble.tokens())
                type_token_ratio = float(types) / float(tokens)
                results.append({
                    'corpus_id': corpus.id,
                    'content_density': cdensity,
                    'idea_density': idensity,
                    'flesch_score': flesch_score,
                    'kincaid_score': kincaid_score,
                    'types': types,
                    'tokens': tokens,
                    'type_token_ratio': type_token_ratio
                })
            return json.dumps(results)
        except TypeError as e:
            print(e)
            raise TransactionException('Corpus contents does not exist.')
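Type-token ratio is simply distinct words over total words; a tiny worked example:

tokens = ["the", "cat", "saw", "the", "dog"]
types = set(tokens)
type_token_ratio = len(types) / len(tokens)  # 4 distinct / 5 total = 0.8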
Example #10
    def run(self, data):
        results = []
        try:
            for corpus in data:
                syllables_parsed = {}
                # Strip silent-pause markers before analysis.
                split_string = re.split(r'\s|\n', corpus.contents)
                temp_corpus = [w for w in split_string
                               if w not in ("{SL}", "{sl}")]
                temp_corpus_contents = " ".join(temp_corpus)
                temp_bubble = SPLAT(temp_corpus_contents.rstrip('\n'))
                temp_tokens = temp_bubble.tokens()
                temp_tokens = ' '.join(temp_tokens).strip("\n").split(' ')
                for tok in temp_tokens:
                    temp_tok = tok.strip("\n")
                    # Treat zero-syllable results as one syllable.
                    temp_syll_count = cUtil.num_syllables([temp_tok]) or 1
                    key = str(temp_syll_count)
                    if key in syllables_parsed:
                        if temp_tok not in syllables_parsed[key]:
                            syllables_parsed[key].append(temp_tok)
                    else:
                        syllables_parsed[key] = [temp_tok]

                results.append({
                    'corpus_id': corpus.id,
                    'syllables': syllables_parsed
                })

            return json.dumps(results)
        except TypeError as e:
            print(e)
            raise TransactionException('Failed to run SplatSyllables.')
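cUtil.num_syllables is project-specific; purely as an illustration, a hypothetical stand-in could approximate English syllable counts by counting contiguous vowel groups:

import re

def approx_num_syllables(words):
    # Hypothetical stand-in for cUtil.num_syllables: count contiguous vowel
    # groups per word, a common rough proxy for English syllable counts.
    return sum(len(re.findall(r'[aeiouy]+', w.lower())) for w in words)

# approx_num_syllables(["banana"]) -> 3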
Example #11
    def run(self, data):
        results = []
        try:
            for corpus in data:
                temp_bubble = SPLAT(corpus.contents)
                # Gather Unigram Frequencies
                temp_unigrams = temp_bubble.unigrams()
                unigrams = dict()
                for item in temp_unigrams:
                    unigrams[item[0]] = unigrams.get(item[0], 0) + 1

                # Gather Bigram Frequencies
                temp_bigrams = temp_bubble.bigrams()
                bigrams = dict()
                for item in temp_bigrams:
                    parsed_item = ' '.join(item)
                    bigrams[parsed_item] = bigrams.get(parsed_item, 0) + 1

                # Gather Trigram Frequencies
                temp_trigrams = temp_bubble.trigrams()
                trigrams = dict()
                for item in temp_trigrams:
                    parsed_item = ' '.join(item)
                    trigrams[parsed_item] = trigrams.get(parsed_item, 0) + 1

                results.append({
                    'corpus_id': corpus.id,
                    'unigrams': unigrams,
                    'bigrams': bigrams,
                    'trigrams': trigrams
                })
            return json.dumps(results)
        except TypeError:
            raise TransactionException('Corpus contents does not exist.')
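SPLAT's unigrams()/bigrams()/trigrams() do the n-gram windowing here; for reference, a minimal sketch of the same frequency counting over a plain token list, using zip windows and collections.Counter as stand-ins for the SPLAT calls:

from collections import Counter

tokens = "the cat sat on the mat".split()
unigrams = Counter(tokens)
bigrams = Counter(' '.join(p) for p in zip(tokens, tokens[1:]))
trigrams = Counter(' '.join(t) for t in zip(tokens, tokens[1:], tokens[2:]))
# unigrams['the'] == 2; bigrams['the cat'] == 1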
Example #12
def get_operation_handler(operation):
    if operation == 'lemmatize_wordnet':
        return LemmatizerWordNet()
    elif operation == 'removecapsgreedy':
        return RemoveCapsGreedy()
    elif operation == 'removecapsnnp':
        return RemoveCapsPreserveNNP()
    elif operation == 'removepunct':
        return RemovePunct()
    elif operation == 'removesilence':
        return RemoveSilence()
    elif operation == 'remove_stopwords':
        return RemoveStopwords()
    elif operation == 'sentence_tokenize':
        return SentenceTokenize()
    elif operation == 'removehashtags':
        return RemoveHashtags()
    elif operation == 'removequotes':
        return RemoveQuotes()
    elif operation == 'stem_porter':
        return StemmerPorter()
    elif operation == 'stop_words':
        return RemoveStopwords()
    elif operation == 'tfidf':
        return Tfidf()
    elif operation == 'wordcloudop':
        return WordCloudOp()
    elif operation == 'word_tokenize_treebank':
        return WordTokenizeTreebank()
    elif operation == 'word_tokenize_whitespace_punct':
        return WordTokenizeWhitespacePunct()
    elif operation == 'word_tokenize_stanford':
        return WordTokenizeStanford()
    elif operation == 'nlp-pos':
        return StanfordCoreNLP('pos')
    elif operation == 'nlp-ner':
        return StanfordCoreNLP('ner')
    elif operation == 'nlp-sentiment':
        return StanfordCoreNLP('sentiment')
    elif operation == 'nlp-coref':
        return StanfordCoreNLP('coref')
    elif operation == 'nlp-relation':
        return StanfordCoreNLP('relation')
    elif operation == 'splat-disfluency':
        return SplatDisfluency()
    elif operation == 'splat-ngrams':
        return SplatNGrams()
    elif operation == 'splat-complexity':
        return SplatComplexity()
    elif operation == 'splat-pos':
        return SplatPOSFrequencies()
    elif operation == 'splat-syllables':
        return SplatSyllables()
    elif operation == 'splat-pronouns':
        return SplatPronouns()
    elif operation == 'char-ngrams':
        return CharNgrams()
    elif operation == 'length-stats':
        return LengthStatistics()
    elif operation == 'topic-model-10':
        return TopicModel(10)
    elif operation == 'topic-model-30':
        return TopicModel(30)
    elif operation == 'word-vector':
        return WordVector()
    elif operation == 'unsup-morph':
        return UnsupervisedMorphology()
    elif operation == 'bigram-array':
        return BigramArray()
    elif operation == 'speech-token-stats':
        return SpeechTokenStatistics()
    elif operation == 'extract_transcript':
        return ExtractTranscript()
    elif operation == 'noop':
        return NoOp()
    else:
        raise TransactionException(
            f'The requested operation "{operation}" does not exist.')
Example #13
    def run(self, data):
        results = []
        try:
            for corpus in data:
                temp_bubble = SPLAT(corpus.contents)
                raw_disfluencies = Util.count_disfluencies(temp_bubble.sents())
                sentences = {}
                total_counts = {
                    "UM": 0, "UH": 0, "AH": 0, "ER": 0,
                    "HM": 0, "SILENT PAUSE": 0, "REPETITION": 0, "BREAK": 0
                }
                # Reshape the raw per-sentence counts so they read better in JSON.
                for i in raw_disfluencies[0]:
                    temp_dis = {
                        "UM": raw_disfluencies[0][i][0],
                        "UH": raw_disfluencies[0][i][1],
                        "AH": raw_disfluencies[0][i][2],
                        "ER": raw_disfluencies[0][i][3],
                        "HM": raw_disfluencies[0][i][4],
                        "SILENT PAUSE": raw_disfluencies[0][i][5],
                        "REPETITION": raw_disfluencies[0][i][6],
                        "BREAK": raw_disfluencies[0][i][7]
                    }
                    sentences[i] = temp_dis
                    # Accumulate totals for each disfluency type.
                    for k, v in temp_dis.items():
                        total_counts[k] += v

                temp_total = sum(total_counts.values())

                # Average disfluencies per sentence across the whole text.
                average_disfluencies = float(temp_total) / len(raw_disfluencies[0])

                total_disfluencies = dict(total_counts, TOTAL=temp_total)

                results.append({
                    'corpus_id': corpus.id,
                    'sentences': sentences,
                    'average_disfluencies_per_sentence': average_disfluencies,
                    'total_disfluencies': total_disfluencies
                })
            return json.dumps(results)
        except TypeError:
            raise TransactionException('Corpus contents does not exist.')
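The per-type totals above could also be accumulated with collections.Counter; a small sketch over hypothetical per-sentence count dicts like the ones built above:

from collections import Counter

per_sentence = [
    {"UM": 1, "UH": 0, "SILENT PAUSE": 2},  # hypothetical per-sentence counts
    {"UM": 0, "UH": 1, "SILENT PAUSE": 1},
]

totals = Counter()
for counts in per_sentence:
    totals.update(counts)  # element-wise addition of the count dicts
# totals == Counter({'SILENT PAUSE': 3, 'UM': 1, 'UH': 1})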
Example #14
def get_operation_handler(operation):
    if operation == 'lemmatize_wordnet':
        return LemmatizerWordNet()
    elif operation == 'pos_tag':
        return PosTag()
    elif operation == 'removecapsgreedy':
        return RemoveCapsGreedy()
    elif operation == 'removecapsnnp':
        return RemoveCapsPreserveNNP()
    elif operation == 'removepunct':
        return RemovePunct()
    elif operation == 'removesilence':
        return RemoveSilence()
    elif operation == 'remove_stopwords':
        return RemoveStopwords()
    elif operation == 'sentence_tokenize':
        return SentenceTokenize()
    elif operation == 'stem_porter':
        return StemmerPorter()
    elif operation == 'stop_words':
        return RemoveStopwords()
    elif operation == 'tfidf':
        return Tfidf()
    elif operation == 'topic_model':
        return TopicModel()
    elif operation == 'wordcloudop':
        return WordCloudOp()
    elif operation == 'word_tokenize_treebank':
        return WordTokenizeTreebank()
    elif operation == 'word_tokenize_whitespace_punct':
        return WordTokenizeWhitespacePunct()
    elif operation == 'word_tokenize_stanford':
        return WordTokenizeStanford()
    elif operation == 'word_tokenize_spaces':
        return WordTokenizeSpaces()
    elif operation == 'word_tokenize_tabs':
        return WordTokenizeTabs()
    elif operation == 'nlp-pos':
        return StanfordCoreNLP(['pos'])
    elif operation == 'nlp-ner':
        return StanfordCoreNLP(['pos', 'ner'])
    elif operation == 'nlp-sentiment':
        return StanfordCoreNLP(['parse', 'sentiment'])
    elif operation == 'nlp-parse':
        return StanfordCoreNLP(['parse'])
    elif operation == 'nlp-coref':
        return StanfordCoreNLP(['tokenize', 'ssplit', 'coref'])
    elif operation == 'nlp-relation':
        return StanfordCoreNLP(['parse', 'relation'])
    elif operation == 'splat-disfluency':
        return SplatDisfluency()
    elif operation == 'splat-ngrams':
        return SplatNGrams()
    elif operation == 'splat-complexity':
        return SplatComplexity()
    elif operation == 'splat-pos':
        return SplatPOSFrequencies()
    elif operation == 'splat-syllables':
        return SplatSyllables()
    elif operation == 'splat-pronouns':
        return SplatPronouns()
    elif operation == 'noop':
        return NoOp()
    else:
        raise TransactionException("The requested operation does not exist.")