def parse_line(line):
    vid, title, comment = line[:-1].split('~')
    title = util.clean_text(title)
    comment = util.clean_text(comment)
    title_ix = split_to_words(title)
    if len(title_ix) == 0:
        return [], []
    comment_ix = split_to_words(comment)
    return title_ix, comment_ix
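# A hedged usage sketch for parse_line above, assuming the '~'-delimited record
# format it expects ("<id>~<title>~<comment>" with a trailing newline) and that
# util.clean_text / split_to_words behave as in the snippet; the sample line is
# hypothetical, not data from the original project.
sample = "v123~Great tutorial on parsing~Thanks, this helped a lot\n"
title_ix, comment_ix = parse_line(sample)
print(title_ix, comment_ix)  # two token-index lists, or ([], []) if the title cleans to nothing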
def LoadFile(self, path):
    self.SetReadOnly(True)
    self.Disable()
    old_path = self.path
    self.path = path
    self.sig_title_changed.signal(self)
    try:
        text = (yield async_call(read_file, path, "r"))
        text, self.file_encoding = decode_text(text)
        text = clean_text(text)
        self.modified_externally = False
        self.SetReadOnly(False)
        self.SetSyntaxFromFilename(path)
        self.SetText(text)
        self.SetSavePoint()
        if old_path:
            self.env.remove_monitor_path(old_path)
        self.env.add_monitor_path(path)
    except:
        self.path = old_path
        self.sig_title_changed.signal(self)
        self.sig_status_changed.signal(self)
        raise
    finally:
        self.Enable()
        self.SetReadOnly(False)
def classify(input_file, delimiter=",", classifier_param="LinearSVC"):
    data = []
    with open(input_file, 'rU') as f:
        reader = csv.reader(f, delimiter=delimiter)
        next(reader, None)  # skip the headers
        for row in reader:
            person = row[2]
            text = row[3]
            likes = row[5]
            data.append((text, person, likes))

    try:
        classifier = joblib.load("model/%s_classifier.pkl" % classifier_param)
    except IOError:
        print "unable to load classifier: %s. Exiting program." % classifier_param
        sys.exit(1)

    with open("results/%s_%s" % (classifier_param, input_file.replace("/", "-")), 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(["label", "message", "author", "likes"])
        for message, author, likes in data:
            cleaned_message = util.clean_text(message, True)
            if len(cleaned_message.split(" ")) > 3:
                writer.writerow([util.classify_unknown(classifier, cleaned_message),
                                 message, author, likes])
def get_action(hit):
    address = clean_text(hit['_source']['address'])
    try:
        results = geocoding.get(address)
    except Exception as ex:
        print address, 'Error:', ex
        return batch.get_update_action(hit, {
            'address': address,
            'geocoding_data': None,
            'location_error': 'NotFound'
        })
    n = len(results)
    doc = {}
    if n > 0:
        geo = results[0]
        doc = {
            'formatted_address': geo['formatted_address'],
            'location': {
                'lat': geo['geometry']['location']['lat'],
                'lon': geo['geometry']['location']['lng']
            }
        }
    if n != 1:
        doc['location_error'] = 'NotFound' if n == 0 else 'Ambiguous'
    doc['geocoding_data'] = results
    doc['address'] = address
    return batch.get_update_action(hit, doc)
def parse_file(self, filename, with_topic=False):
    """
    Reads the text file and returns a dictionary in the form:
    tweet_id = (sentiment, text)
    :param with_topic:
    :param filename: the complete file name
    :return:
    """
    # print(filename)
    data = {}
    filename_print_friendly = filename.split("/")[-1].split("\\")[-1]
    if self.verbose:
        print("Parsing file:", filename_print_friendly, end=" ")
    for line_id, line in enumerate(
            open(filename, "r", encoding="utf-8").readlines()):
        try:
            columns = line.rstrip().split(self.SEPARATOR)
            tweet_id = columns[0]
            if with_topic:
                topic = clean_text(columns[1])
                if not isinstance(topic, str) or "None" in topic:
                    print(tweet_id, topic)
                sentiment = columns[2]
                text = clean_text(" ".join(columns[3:]))
                if self.ekphrasis:
                    text = ' '.join(text_processor.pre_process_doc(text))
                if text != "Not Available":
                    data[tweet_id] = (sentiment, (topic, text))
            else:
                sentiment = columns[1]
                text = clean_text(" ".join(columns[2:]))
                if self.ekphrasis:
                    text = ' '.join(text_processor.pre_process_doc(text))
                if text != "Not Available":
                    data[tweet_id] = (sentiment, text)
        except Exception as e:
            print("\nWrong format in line:{} in file:{}".format(
                line_id, filename_print_friendly))
            raise Exception
    if self.verbose:
        print("done!")
    return data
def parse_line(line):
    iid, pid, tags, comment = line[:-1].split('~')
    comment = util.clean_text(comment)
    tags = tags.lower()
    tags_ix = tags.split("#*#")
    if len(tags_ix) == 0:
        return [], []
    comment_ix = split_to_words(comment)
    return tags_ix, comment_ix
def Paste(self):
    wx.TheClipboard.Open()
    try:
        text_data = wx.TextDataObject()
        if wx.TheClipboard.GetData(text_data):
            text = clean_text(text_data.GetText())
            self.ReplaceSelection(text)
    finally:
        wx.TheClipboard.Close()
def generate_titles(my_title):
    my_title = util.clean_text(my_title)
    my_words = my_title.split(' ')
    print(' '.join((w.upper() if w in title_word_to_ix else w) for w in my_words) + '\n')
    my_title_ixs = [title_word_to_ix[w] for w in my_words if w in title_word_to_ix]
    my_title_sample = util.bag_of_words(my_title_ixs, title_dict_size)
    for i in range(10):
        print(' ' + word_ixs_to_str(pred_text(model, my_title_sample), False))
    print('')
def create_model(pos_tweets, neg_tweets, neu_tweets, classifier_param='LinearSVC'):
    # filter away words that are less than 3 letters to form the training data
    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets + neu_tweets:
        words = util.clean_text(words, True)
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        # words_filtered = [' '.join(w) for w in [x for x in nltk.bigrams(words.split())]]
        tweets.append((words_filtered, sentiment))

    # make sure tweets are shuffled randomly
    shuffle(tweets)

    # get the training set and train the classifier
    training_set = nltk.classify.util.apply_features(extract_features, tweets)

    max_specificity = -1
    best_classifier = None
    average_accuracy = 0.0

    # perform 10-fold cross validation
    cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=False, random_state=None)
    for traincv, testcv in cv:
        if classifier_param == "LinearSVC":
            classifier = SklearnClassifier(LinearSVC()).train(
                training_set[traincv[0]:traincv[len(traincv) - 1]])
        elif classifier_param == "Tfid":
            # does TF-IDF weighting,
            # chooses the 1000 best features based on a chi2 statistic,
            # and then passes that into a multinomial naive Bayes classifier.
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', MultinomialNB())])
            classifier = SklearnClassifier(pipeline).train(
                training_set[traincv[0]:traincv[len(traincv) - 1]])
        elif classifier_param == "Bernoulli":
            classifier = SklearnClassifier(BernoulliNB()).train(
                training_set[traincv[0]:traincv[len(traincv) - 1]])
        elif classifier_param == "NaiveBayes":
            classifier = NaiveBayesClassifier.train(
                training_set[traincv[0]:traincv[len(traincv) - 1]])
        else:
            print "Classifier option not available: ", classifier_param
            sys.exit(1)

        accuracy_of_classifier, specificity = \
            util.accuracy(classifier, tweets[testcv[0]:testcv[len(testcv) - 1]])
        average_accuracy += accuracy_of_classifier
        if specificity > max_specificity:
            max_specificity = specificity
            best_classifier = classifier

    print "\naverage accuracy: ", average_accuracy / cv.n_folds

    # save the classifier
    joblib.dump(best_classifier, "model/%s_classifier.pkl" % classifier_param)
    print "saved classifier"
def get_data(data_path, min_count, use_number_norm, embed_path):
    data_ans = json.load(open(data_path, encoding='utf-8'))
    data = replace_text(clean_text(data_ans), use_number_norm)
    new_data = rep_text(data)
    new_data = [(line[0], line[2]) for line in new_data]  # index 2 is BIO, index 1 is BIEOS
    char2idx = build_char_vocab(new_data)
    vocab, word2idx, idx2word, label2index, index2label = build_vocab(
        new_data, min_count)
    pretrain_word_embedding, unk_words = build_pretrain_embedding(
        embedding_path=embed_path, word_index=word2idx)
    return new_data, pretrain_word_embedding, vocab, word2idx, idx2word, label2index, index2label, char2idx
def WriteFile(self, path):
    def do_write_file(path, text):
        mkpath(os.path.dirname(path))
        atomic_write_file(path, text)

    text, self.file_encoding = encode_text(self.GetText(), self.file_encoding)
    text = clean_text(text)
    with self.env.updating_path(path):
        yield async_call(do_write_file, path, text)
    self.modified_externally = False
    self.SetSavePoint()
def on_get(self, req, resp):
    time_response_begin = time.time()
    is_thumbnail = bool(req.get_param("thumbnail"))
    if is_thumbnail:
        cache = self.thumbnail_cache
        layout = self.thumbnail_layout
    else:
        cache = self.image_cache
        layout = self.image_layout
    image_data = None

    # cache hack
    if req.get_header("If-None-Match") and req.get_header("Cache-Control") != "no-cache":
        resp.status = falcon.HTTP_304
        return

    text = clean_text(req.get_param("text", True))
    if len(text) > TEXT_MAX_LEN:
        resp.status = falcon.HTTP_403
        resp.body = "Text too long."
        return

    # if text has no newline, assume space is the newline
    if "\n" not in text:
        text = re.sub(r" +", "\n", text)
    text = re.sub(r"_+", " ", text)

    image_data = cache.get(text)
    cache_hit = image_data is not None
    if not cache_hit:
        time_render_begin = time.time()  # stats
        with layout.base_image.clone() as canvas:
            self.draw_text(layout, canvas, text)
            image_data = canvas.make_blob("png")
        cache.set(text, image_data)
        render_time = time.time() - time_render_begin  # stats
        self.latest_cache_miss_times.append(render_time)  # stats

    resp.set_header("Cache-Control", "public, max-age=3600")
    resp.set_header("Content-Type", "image/png")
    resp.set_header("ETag", hashlib.md5(text.encode("utf-8")).hexdigest())
    resp.body = image_data

    now = time.time()
    if cache_hit:
        cache_hit_time = time.time() - time_response_begin
    response_time = now - time_response_begin
    self.latest_response_times.append(response_time)
    if cache_hit:
        self.latest_cache_hit_times.append(cache_hit_time)
def write_tweet_info(tweet_object, options):
    debug_print("Writing tweet info")
    output_path = Path(options['output_dir']) / options['search_name'] / (options['search_name'] + '.txt')
    with output_path.open('a') as output_file:
        output_file.write('Tweet info:\n')
        output_file.write('\tID:{}\n'.format(tweet_object['id_str']))
        output_file.write('\tCreated at:{}\n'.format(tweet_object['created_at']))
        output_file.write('\tAuthor:{}\n'.format(tweet_object['user']['screen_name']))
        output_file.write('\tText:{}\n'.format(clean_text(tweet_object['text'])))
        try:
            output_file.write('\tURL:{}\n'.format(tweet_object['entities']['urls'][0]['expanded_url']))
        except Exception:
            pass
def clean(src):
    src['name'] = clean_text(src['name'])
    src['mission'] = deep_try_get(src, 'mission', 'gs_data.mission', 'fb_data.mission')
    src['city'] = deep_try_get(src, 'city', 'source_data.city', 'gs_data.city')
    src['state'] = deep_try_get(src, 'state', 'gs_data.state')
    src['link'] = deep_try_get(src, 'link', 'source_data.website', 'gs_data.website')

    keywords = deep_try_get(src, 'keywords', 'gs_data.exchange.keywords')
    if keywords:
        src['keywords'] = clean_list(keywords.split(','))
    if 'areas' in src:
        src['areas'] = clean_list(src['areas'])

    value = deep_get(src, 'gs_data.geographic_areas_served')
    if value:
        deep_set(src, 'gs_data.geographic_areas_served', clean_list(value))
    value = deep_get(src, 'gs_data.organization_ntee_codes')
    if value:
        deep_set(src, 'gs_data.organization_ntee_codes', clean_list(value))

    value = deep_get(src, 'tw_data.created_at')
    if value:
        deep_set(src, 'tw_data.created_at', parse_date(value).isoformat())
    value = deep_get(src, 'tw_data.status.created_at')
    if value:
        deep_set(src, 'tw_data.last_update_at', parse_date(value).isoformat())
        del src['tw_data']['status']['created_at']

    value = deep_get(src, 'fb_data.location')
    if value:
        deep_set(src, 'fb_data.address', value)
        del src['fb_data']['location']
    value = deep_get(src, 'fb_data.address.latitude')
    if value:
        deep_set(src, 'fb_data.location.lat', value)
    value = deep_get(src, 'fb_data.address.longitude')
    if value:
        deep_set(src, 'fb_data.location.lon', value)

    if 'isSiteDown' in src:
        src['is_site_down'] = src['isSiteDown']
        del src['isSiteDown']
def predict_sentiment(texts):
    model = load_model()
    tokenizer = load_tokenizer()
    for text in texts:
        texts[texts.index(text)] = clean_text(text)
    texts = np.array(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    data = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=50)
    prediction = model.predict(data)
    keras.backend.clear_session()
    return prediction.tolist()
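# A hedged usage sketch for predict_sentiment above; load_model/load_tokenizer and
# the 50-token padding come from the snippet itself, and the example texts are invented.
reviews = ["The plot was wonderful and the acting superb.",
           "Two hours of my life I will never get back."]
scores = predict_sentiment(reviews)
print(scores)  # list of per-text model outputs; shape depends on the loaded model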
def sentiment_analysis_result(input_review, *args, **kwargs):
    # TODO: return sentiment analysis on input tweet
    # 1: positive, 0: negative
    modelfilename, bowTransformefilename, TfidfTransformefilenam = args
    Model = pickle.load(open(modelfilename, "rb"))
    bow_transformer = pickle.load(open(bowTransformefilename, "rb"))
    tfidf_transformer = pickle.load(open(TfidfTransformefilenam, "rb"))
    input_review = clean_text(input_review)
    input_review = [input_review]  # convert it to a list because transform needs an iterable object
    messages_bow_test = bow_transformer.transform(input_review)
    messages_transformer_test = tfidf_transformer.transform(messages_bow_test)
    result = Model.predict(messages_transformer_test)
    return result[0]
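# An illustrative call for sentiment_analysis_result above; the three pickle paths
# are placeholders for the saved model and transformers, not files from the original project.
result = sentiment_analysis_result(
    "I really enjoyed this phone, the camera is fantastic.",
    "model.pkl", "bow_transformer.pkl", "tfidf_transformer.pkl")
print(result)  # 1 for positive, 0 for negative, per the comment in the function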
def embedding_route():
    response = {
        "text": None,
        "embeddings": None,
        "err": None
    }
    try:
        a = request.args.get("txt")
        x = clean_text(a)
        response['text'] = a
        response['embeddings'] = str(w2v.vector(x))
        return jsonify(response)
    except Exception as err:
        response['err'] = str(err)
        return jsonify(response)
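# A possible client-side call for embedding_route above, assuming the handler is
# registered on a Flask app at a path such as /embedding; the URL, port, and route
# name are assumptions, not shown in the snippet.
import requests
resp = requests.get("http://localhost:5000/embedding", params={"txt": "hello world"})
print(resp.json())  # {"text": ..., "embeddings": ..., "err": ...}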
def scrape_webpage(root, dir_, format_, lang="", needs_check=False):
    """
    Navigates to input URL and parses HTML for relative links,
    scraping each page for its text
    """
    text_dict = dict()
    queue = ScrapeQueue(root)

    while len(queue) > 0:
        # Get html data from url on queue
        url = queue.dequeue()
        print(f"Scraping {url}...")
        r = requests.get(url)

        # Log bad request to console
        if not r.status_code == 200:
            print(f"Failed to scrape {url}.\n Status code: {r.status_code}\n")
            continue

        # Using lxml parser and utf-8 to account for various charsets
        soup = BeautifulSoup(r.content, "lxml", from_encoding="utf-8")
        hrefs = yank_hrefs(root, url, soup.find_all("a"))  # set instance
        if not hrefs:
            print(f"Error occurred in scraping links from {url}.\n"
                  "Check status of chromedriver executable and try again.")
            return -2

        # Add found links to queue
        for h in hrefs:
            queue.enqueue(h)

        # Add contents to text_dict, keyed by the page <title> (fall back to the URL)
        title = soup.title.string if soup.title else url
        text = soup.get_text(separator=",")
        if not lang:
            lang = detect_lang(text)  # Set lang
        text_dict[title] = clean_text(text)  # Text passed as generator to lower mem load
        print()

    if needs_check:
        return check_spelling(text_dict)
    return save_scrapings(root, dir_, format_, text_dict, lang)
def gposttl(utterance, identifier="no_id"):
    utterance = clean_text(utterance)
    p = subprocess.Popen(['gposttl', '--silent'],
                         stdout=subprocess.PIPE,
                         stdin=subprocess.PIPE)
    (out, error) = p.communicate(utterance.encode('utf8'))
    if p.returncode == 0:
        if len(out) > 1:
            out = out.decode('utf8')
            tokens = [a.replace(u'\t', u'|') for a in out.split(u'\n') if len(a) > 0]
            return tokens
        else:
            raise Exception('Error: no text to parse')
    else:
        msg = str(error)
        code = str(p.returncode)
        sys.stderr.write(utterance + '\n\n')
        raise Exception(': '.join([identifier, code, msg]))
def prediction(review):
    global classifier
    global _cv
    global _tfidf
    cleaned_review = util.clean_text(review)
    cleaned_review = [cleaned_review]
    vect = _cv.transform(cleaned_review).toarray()
    rev = _tfidf.transform(vect)
    rev_sparse_tensor = convert_sparse_matrix_to_sparse_tensor(rev)
    ordered_sparse_tensor = tf.compat.v1.sparse.reorder(rev_sparse_tensor)
    my_prediction = classifier.predict(ordered_sparse_tensor)
    predicted_value = my_prediction.item(0)
    print(predicted_value)  # debug output
    if predicted_value >= 0.5:
        prediction_status = "Great! This is a positive review."
        print("Hello1")  # debug output
    elif predicted_value < 0.5:
        prediction_status = "Oops! This is a negative review."
        print("Hello2")  # debug output
    return prediction_status
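# A small illustrative call for the prediction helper above, assuming the module-level
# classifier, _cv and _tfidf globals have already been loaded; the review text is invented.
status = prediction("The battery life is excellent and the screen is gorgeous.")
print(status)  # either the positive or the negative status message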
def read_data(file_path):
    with open(file_path, "r", encoding="utf8") as inFile:
        lines = inFile.readlines()

    datas = []
    for index, line in enumerate(tqdm(lines, desc="read_data")):
        # Split the input line on tabs
        pieces = line.strip().split("\t")

        # Check that the row has the expected format
        # assert len(pieces) == 3
        if len(pieces) != 3:
            continue
        if index == 0:
            continue

        pieces[1] = clean_text(os.path.join(config["root_dir"], "data"), pieces[1])
        id, sequence, label = pieces[0], pieces[1], int(pieces[2])
        datas.append((id, sequence, label))
    return datas
def predict():
    params = json.loads(request.data.decode("utf-8"))
    inputs = params['inputs']
    random = params.get('random', False) in (True, 'true')
    temperature = float(params.get('temperature', 1.))
    top_k = int(params.get('topk', 1))
    number = int(params.get('number', 1))
    kind = params.get('kind', 'word')

    if kind == 'para':
        nb_para = number
        nb_sentence = -1
        nb_word = -1
    elif kind == 'sentence':
        nb_para = 1
        nb_sentence = number
        nb_word = -1
    else:
        nb_para = 1
        nb_sentence = -1
        nb_word = number

    config = {
        'inputs': inputs,
        'random': random,
        'temperature': temperature,
        'top_k': top_k,
        'nb_word': nb_word,
        'nb_sentence': nb_sentence,
        'nb_para': nb_para,
    }

    input_words = clean_text(inputs)
    if len(input_words) == 0:
        input_words.append("")

    y = rnn.predict(input_words, config)
    return json.dumps({'config': config, 'output': ' '.join(y)})
def de_duplicate(files, output):
    """
    Input a list of JL files, output a single processed JL file
    """
    items = {}
    punct_regex = re.compile('[%s]' % re.escape(string.punctuation))
    for pth in files:
        with open(pth, 'rb') as reader:
            lines_no = 0
            for line in reader:
                try:
                    item = json.loads(line)
                except:
                    continue
                lines_no += 1
                # Fix and normalize text
                item['text'] = clean_text(item['text'])
                # Lower and strip punctuation
                txt = punct_regex.sub('', item['text'].lower())
                # Drop extra spaces
                txt = ' '.join(txt.split())
                if txt in items:
                    # Merge existing tags
                    item['tags'].extend(items[txt]['tags'])
                    item['tags'] = sorted(set(t.lower() for t in item['tags']))
                else:
                    # Normalize tags
                    item['tags'] = sorted(t.lower() for t in item['tags'])
                # The dict KEY will automatically overwrite duplicates
                items[txt] = item
        print(f'Read {lines_no} items from "{pth}".')

    out_items = sorted(items.values(), key=lambda x: x['author'].lower())
    print(f'Written {len(out_items)} items in "{output}".')
    json.dump(out_items, open(output, 'w'))
def GetPredictionOnEvalSet(modelfilename, tokenizer):
    global q_max_words, p_max_words, emb_dim
    max_para_length = 362
    max_query_length = 38
    model = load_model('./trained_model/' + modelfilename + '.h5')
    # f = open(testfile, 'r', encoding="utf-8")
    all_scores = {}  # Dictionary with key = query_id and value = array of scores for respective passages
    queryE, paraE = util.loadEvalData()
    if tokenizer is None:
        with open('./data/tokenizer-Stemed2Lqueries.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)

    keys = list(queryE.keys())
    print(len(keys))
    for i in range(len(keys)):
        Xquery, Xpara = list(), list()
        query_id = keys[i]
        # retrieve query
        query = queryE[query_id]
        query = util.clean_text([query])
        # retrieve text input
        paralist = paraE[query_id]
        cleanparalist = list()
        for j in range(len(paralist)):
            cleanparalist.append(util.clean_text([paralist[j]]))
        # generate input-output pairs
        query_seq = tokenizer.texts_to_sequences([query])[0]
        para_seq = tokenizer.texts_to_sequences(cleanparalist)
        # split one sequence into multiple X,y pairs
        padded_query_seq = pad_sequences([query_seq], maxlen=max_query_length)[0]
        # print(padded_query_seq)
        padded_para_seq = pad_sequences(para_seq, maxlen=max_para_length)
        for k in range(len(paralist)):
            Xquery.append(padded_query_seq)
            Xpara.append(padded_para_seq[k])
        score = model.predict([array(Xquery), array(Xpara)], verbose=0)  # do forward-prop on model to get score
        score = score[:, 1]  # take the column at index 1
        if query_id in all_scores:
            all_scores[query_id].append(score)
        else:
            all_scores[query_id] = [score]
        text = "\r{0} {1}".format("Done queries: ", i)
        sys.stdout.write(text)
        sys.stdout.flush()

    fw = open("answer.tsv", "w", encoding="utf-8")
    for query_id in all_scores:
        scores = all_scores[query_id]
        s = ""
        for sc in scores:
            for value in sc:
                value = format(value, 'f')  # .replace("\n", "").replace("[", "").replace("]", "")
                s = s + str(value) + "\t"
        s = re.sub(' {2,}', ' ', s)
        s = re.sub(' ', '\t', s)
        fw.write(str(query_id) + "\t" + s.rstrip("\t") + "\n")
        '''
        scores_str = [str(sc) for sc in scores]  # convert all scores to string values
        scores_str = "\t".join(scores_str)  # join all scores in list to make it one string with tab delimiter.
        re.sub("[|]", "", scores_str)
        scores_str.replace(" ", "\t")
        fw.write(str(query_id) + "\t" + scores_str + "\n")
        '''
    fw.close()
def get_action(hit):
    address = hit['_source']['address']
    m = re.search(r'.*[\r\n\t\s]+([\w\W]+?)(tel|EIN):', address, re.M)
    address = clean_text(m.group(1))
    return batch.get_update_action(hit, {'address': address})
def test_clean_text(self):
    text = 'æœìíîïýÿòóôõöáâãäëñûüx2,X2'
    cleaned_text = util.clean_text(text)
    true_text = ['aeoeiiiiyyoooooaaaaenuu', ',']
    self.assertEqual(cleaned_text, true_text)
from sklearn.decomposition import PCA
import seaborn as sns
# Imports needed by the rest of this snippet
import nltk
import numpy as np
from nltk.corpus import gutenberg, state_union
from tqdm import tqdm

sns.set()
sns.set_context('poster')

nltk.download('state_union')
nltk.download('gutenberg')

# Set current corpus (state_union, gutenberg)
corpus = gutenberg
textnames = corpus.fileids()
corpusnames = {gutenberg: 'Gutenberg books', state_union: 'State union documents'}

print("cleaning texts..")
clean_texts = {name: clean_text(corpus.raw(name)) for name in tqdm(textnames)}

print("Calculating frequencies..")
freqs = {name: nltk.FreqDist(text) for name, text in tqdm(clean_texts.items())}

# Create complete vocabulary
wordlist = set()
for freq in freqs.values():
    wordlist.update(freq.keys())
vocabulary = {word: idx for (idx, word) in enumerate(wordlist)}

# Convert texts to wordcount vectors
word_vectors = {name: np.zeros(len(wordlist)) for name in textnames}
for textname, vector in word_vectors.items():
    for word, freq in freqs[textname].items():
        vector[vocabulary[word]] += freq
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk

import util

# Doesn't work very well in French ...
LANGUAGE = "english"

text_file = util.get_text(LANGUAGE)
text_str = text_file.read()
text_str = nltk.word_tokenize(text_str, language=LANGUAGE)
text_str = util.clean_text(text_str, LANGUAGE)
text_tag = nltk.pos_tag(text_str)
nltk_text = nltk.Text(text_str)
print type(nltk_text)

# CHUNKING
sentence = nltk.word_tokenize("Bouteflika is the president of Algeria.")
sentence = nltk.pos_tag(sentence)

# grammar = "Actor: {<DT>?<JJS>*<NNP>+}"  # JJ: adjective
# chunk = nltk.RegexpParser(grammar)
# result = chunk.parse(text_tag)
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, Washington, USA,
"""
import util

negative_stop_words = util.get_file_content_as_list(
    "resources/negative-stop-words.txt")
neutral_stop_words = util.get_file_content_as_list(
    "resources/neutral-stop-words.txt")
positive_stop_words = util.get_file_content_as_list(
    "resources/positive-stop-words.txt")
negative_words = util.get_file_content_as_list("resources/negative-words.txt")
positive_words = util.get_file_content_as_list("resources/positive-words.txt")

original_text = util.format_into_string(
    util.get_file_content_as_list("resources/text.txt"))
text = util.clean_text(util.get_file_content_as_list("resources/text.txt"),
                       neutral_stop_words)

sentiment = util.analyze_text_sentiment(text, negative_stop_words,
                                        positive_stop_words, negative_words,
                                        positive_words)

print("The original text was: " + original_text)
print("\nThe sentiment score was: " + str(sentiment))
print("This corresponds to a " + util.mood(sentiment) + " mood ... " +
      util.emoji(sentiment) + "\n")
"--nb_para", default=1, type=int, help= "How many paragraph should it return (default: %(default)s, -1: no limit)") # parser.add_argument('--use_server', nargs="?", const=True, default=False, type=bool, help='Should use the Server architecture') args = parser.parse_args() results_dir = dir + '/results' rnn_dir = dir + '/' + args.model_dir config = vars(args) config['log_dir'] = rnn_dir config['restore_embedding'] = False config['seq_length'] = None input_words = clean_text(config['inputs']) # if args.use_server is True: # with open('clusterSpec.json') as f: # clusterSpec = json.load(f) # config['target'] = 'grpc://' + clusterSpec['server'][0] # pass rnn = RNN(config) y = rnn.predict(input_words, config) print('__BBB_START__') # Marker for the Regexp used in the App, do not remove json = json.dumps({ 'config': { 'inputs': args.inputs, 'random': args.random, 'temperature': args.temperature, 'top_k': args.top_k,