def predict():
    response = {"success": False, "Content-Type": "application/json"}
    payload = flask.request.get_json()
    if payload.get("text"):
        # The client sends the text as a one-element list.
        text = payload.get("text")[0]
        ta = TextAnalyzer()
        ma = ta.morphological_analize(text)
        text = " ".join(ma[0])
        print(text)
        feature = vectorizer.transform([text]).toarray()
        response["prediction"] = clf.predict(feature).tolist()
        print(response)
        response["success"] = True
    return flask.jsonify(response)
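For reference, a minimal client-side sketch of how this endpoint might be exercised, assuming the function is registered as a Flask route at /predict and the app listens on localhost:5000 (neither binding is shown in the snippet). Note that the handler reads the first element of a "text" list, so the payload wraps the string in a list:

import requests

# Hypothetical host and route; the snippet does not show the @app.route binding.
URL = 'http://localhost:5000/predict'

# The handler indexes get_json().get("text")[0], so "text" must be a list.
payload = {'text': ['some document to classify']}

resp = requests.post(URL, json=payload)
print(resp.json())  # e.g. {'success': True, 'prediction': [...], ...}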
class OpEdManager:
    def __init__(self):
        self.html_fetcher = HtmlFetcher()
        self.text_analyzer = TextAnalyzer()
        self.oped_store = OpEdStore()

    def process(self):
        oped_categories = self.oped_store.get_categories()
        parser = RssFeedFetcher()
        toi_oped_parser = ToiOpinionFetcher()
        for oped_category in tqdm(oped_categories, total=len(oped_categories)):
            try:
                if oped_category.id == 1:
                    # The Times of India opinion section needs its own fetcher.
                    articles = toi_oped_parser.get_feeds(
                        'https://timesofindia.indiatimes.com/blogs/')
                else:
                    articles = parser.get_feeds(oped_category.link)
                self.process_articles(oped_category, articles)
            except Exception as e:
                print("Encountered exception while processing op-ed feeds.")
                print(e)
                print(oped_category)

    def process_articles(self, oped_category, articles):
        for article in articles:
            try:
                paragraphs = self.html_fetcher.get_paragraphs(article.link)
                text = ' '.join(paragraphs)
                article.keywords = self.text_analyzer.get_keywords(text)
                article.summary = self.text_analyzer.get_summary(text)
            except Exception as e:
                print("Encountered exception while fetching and "
                      "processing article.")
                print(e)
                print(article)
        self.oped_store.add_articles(oped_category, articles)
def get_reviews(link: str):
    # The ASIN is the last path segment of the product link.
    asin = link[link.rindex('/') + 1:]
    api_key = CONFIG.KEY_AMAZON_API
    params = {
        'api_key': api_key,
        'type': 'reviews',
        'amazon_domain': 'amazon.it',
        'asin': asin,
        'page': '3'
    }
    api_result = requests.get('https://api.rainforestapi.com/request', params)
    reviews = api_result.json()["reviews"]
    list_of_body = [review["body"] for review in reviews]

    # Average the star ratings, guarding against an empty review list.
    total = sum(float(review["rating"]) for review in reviews)
    mean = total / len(reviews) if reviews else 0

    text_analyzer = TextAnalyzer()
    if list_of_body:
        results = text_analyzer.sentiment_analysis(list_of_body)
    else:
        results = None
    return (results, mean)
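A quick usage sketch, assuming a valid Rainforest API key in CONFIG.KEY_AMAZON_API and an amazon.it product URL whose last path segment is the ASIN (the link below is a made-up example):

# Hypothetical product link; only the trailing ASIN segment matters here.
link = 'https://www.amazon.it/dp/B000000000'

sentiment_results, mean_rating = get_reviews(link)
print(mean_rating)        # average star rating over the fetched page of reviews
print(sentiment_results)  # None when no review bodies were returned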
class TestOccurences(unittest.TestCase):
    '''
    This class inherits the default Python unit-testing methods
    from unittest.TestCase.
    '''

    def setUp(self):
        '''
        Setup method for the unit test. The instances needed by the
        test are created here. The actions in this method run before
        each test is executed.
        '''
        self.a1 = TextAnalyzer(0)
        self.a2 = TextAnalyzer(1)
        self.r1, self.r2 = self.a1.get_results(), self.a2.get_results()

    def test_final_results(self):
        '''
        This method verifies that the results produced by two distinct
        analyzers are identical. Specifically, assertEqual passes when
        the results coincide; if they differ, the test fails and the
        given error message is reported.
        '''
        self.assertEqual(self.r1, self.r2,
                         "chars_occurrence_test: test failed.")

    def tearDown(self):
        '''
        This method contains the actions to perform after the test has
        run and the results have been produced. In particular, the
        temporary files created for the test are deleted.
        '''
        try:
            os.remove("splitted_file_0.txt")
            os.remove("splitted_file_1.txt")
        except OSError:
            pass
from telegram import ReplyKeyboardMarkup
from telegram.ext import Updater, MessageHandler, Filters
from telegram.ext import CommandHandler

from text_analyzer import TextAnalyzer
from parse import get_links_from_pages, get_all_text

TOKEN = '1861188889:AAGu61Kgfh6CNbFuqKP6o4o38tLerG4lngM'

print('Parsing in progress, please wait')
# name, depth = 'Python', 1
name, depth = 'Задача', 2
res = get_links_from_pages(name, depth)
text = get_all_text(res)
analyzer = TextAnalyzer(text)


def text_handler(update, context):
    update.message.reply_text('I only respond to commands')


def start(update, context):
    update.message.reply_text(
        "Hi! Press /help to see my commands and what I can do.",
        reply_markup=ReplyKeyboardMarkup(
            [['/help', '/stop_words', '/describe']],
            resize_keyboard=True))


def help(update, context):
    update.message.reply_text("""
# The top of this file is truncated; argparse/logging and a Bottle-style
# `post`/`request` import are assumed here, as are the TextAnalyzer and
# RedditUtil imports (not shown in the original snippet).
import argparse
import logging

from bottle import post, request


def gather_args():
    # Parser construction is not shown in the original snippet;
    # a plain argparse.ArgumentParser() is assumed.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--port', type=int, nargs='?', default=9010,
                            help='The port to run tristan on.')
    return arg_parser.parse_args().port


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    port = gather_args()
    text_analyzer = TextAnalyzer()
    reddit_util = RedditUtil()

    @post('/tristan')
    def search():
        search_json = request.json
        if search_json:
            if 'queries' in search_json.keys():
                results = []
                subreddit_cache = {}
import numpy as np
from sklearn.datasets import fetch_20newsgroups

from constants import categories
from text_analyzer import TextAnalyzer

if __name__ == '__main__':
    train_news_groups = fetch_20newsgroups(subset='train',
                                           categories=categories)
    print('Train data_groups loaded')
    test_news_groups = fetch_20newsgroups(subset='test',
                                          categories=categories)
    print('Test data_groups loaded')

    text_analyzer = TextAnalyzer((train_news_groups, test_news_groups))
    text_analyzer.load_model('models/standard_model/model.ckpt')

    # Score the model on the first 20 test documents.
    texts, correct_categories_codes = text_analyzer.service.get_batch(
        test_news_groups, 0, 20)
    predicted_categories = text_analyzer.get_prediction(texts)
    correct_categories = np.argmax(correct_categories_codes, 1)

    correct_predictions_count = 0
    incorrect_predictions_count = 0
    for index in range(20):
        if correct_categories[index] == predicted_categories[index]:
            correct_predictions_count += 1
        else:
            incorrect_predictions_count += 1
    print(
from text_analyzer import TextAnalyzer

if __name__ == '__main__':
    # TODO: finish me
    ta = TextAnalyzer('data/emma.txt')
    print(ta.sentences[:10])
import os

from text_analyzer import TextAnalyzer


def title_case(string):
    return string.replace('_', ' ').title()


for author in os.listdir('text'):
    texts = os.listdir("text/{}".format(author))
    for book in texts:
        text = open("text/{}/{}".format(author, book)).read()
        title = title_case(book.replace('.txt', ''))
        csv = TextAnalyzer.analyze(text, title, title_case(author))
        print(csv)
def create_corpus(df):
    ta = TextAnalyzer()
    # Morphologically analyze each document, then rejoin the tokens
    # into a single whitespace-separated string per row.
    ma = df["text"].apply(lambda x: ta.morphological_analize(x))
    corpus = ma.apply(lambda x: " ".join(x[0]))
    return corpus
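A small usage sketch for create_corpus, assuming a pandas DataFrame with a "text" column (pandas is implied by the .apply calls but not imported in the snippet):

import pandas as pd

df = pd.DataFrame({'text': ['first document', 'second document']})
corpus = create_corpus(df)
print(corpus.tolist())  # one space-joined token string per row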
from sklearn.datasets import fetch_20newsgroups

from constants import categories
from text_analyzer import TextAnalyzer

if __name__ == '__main__':
    train_news_groups = fetch_20newsgroups(subset='train',
                                           categories=categories)
    print('Train data_groups loaded')
    test_news_groups = fetch_20newsgroups(subset='test',
                                          categories=categories)
    print('Test data_groups loaded')

    text_analyzer = TextAnalyzer((train_news_groups, test_news_groups))
    text_analyzer.train_model(train_news_groups,
                              test_data_groups=test_news_groups,
                              path='models/test/model')
# rangeCount = 10
# rangeAmount = wordLength // rangeCount
# for x in range(0, rangeCount):
#     start = x * rangeAmount
#     end = start + rangeAmount
#
#     txt = ' '.join(splitWords[start:end])
#     print("Words: " + str(start) + " - " + str(end))
#     datas[x] = TextAnalyzer(txt).fullPass()
#     print("\n\n")

# splitData = " ".join(data.split()[1000:2000])
TextAnalyzer(data).fullPass()
# TextAnalyzer(splitData).fullPass()

import pdb
# pdb.set_trace()

# print(wn.synsets("bird")[0].lemma_names())
# print(wn.synsets("cat")[0].definition())
# print(TextAnalyzer.upgrade_word("cat"))
test = TextAnalyzer("reading book in winter in the state of michigan")

# hit = wn.synsets("hit")[0]
# cat = wn.synsets("cat")[0]
# one = []
# two = []
# for synset in wn.synsets("cat"):
#     print(synset.lch_similarity(cat))