def lemmatize(input, multiList=False, cascade=True):
    stemmed = []
    if cascade:
        if not multiList:
            filtered = stopwordremover.remove_stop_word(input)
        else:
            filtered = stopwordremover.remove_stop_word(input, True)
        for word in filtered:
            stemmed.append(lemmatizationEngine(word))
        stemmed = list(set(stemmed))
        return stemmed
    else:
        if not multiList:
            lst = normalizer.normalize(input)
        else:
            lst = normalizer.normalize(input, True)
        for word in lst:
            stemmed.append(lemmatizationEngine(word))
        # stemmed = list(set(stemmed))
        return stemmed
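# Usage sketch (illustrative only; the actual output depends on the behaviour of
# stopwordremover, normalizer and lemmatizationEngine in this project):
# lemmatize('hello i Am mayank. I Am a Good boy')                 # stop words removed, duplicate lemmas dropped
# lemmatize('hello i Am mayank. I Am a Good boy', cascade=False)  # normalization only, duplicates kept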
def test_asciichars(self):
    """Fix ASCII characters"""
    self.assertEqual(norm.normalize("What’s up"), "what is up")
    self.assertEqual(norm.normalize("What's up"), "what is up")
    self.assertEqual(norm.normalize("I said “shut up”"), 'I said "shut up"')
    self.assertEqual(norm.normalize("œ"), '')
def test_replacesubstitutes(self):
    """should replace substitutes"""
    self.assertEqual(
        norm.normalize("Nov 1st I weighed 90 kgs. total"),
        "November 1st I weighed 90 kilograms total")
    self.assertEqual(
        norm.normalize("I shared it on FB w/ friends, ie: you"),
        "I shared it on Facebook with friends, for example : you")
def remove_stop_word(input, multiList=False):
    # A single string: normalize it, then drop stop words and numbers.
    if isinstance(input, str):
        filtered = [word for word in normalizer.normalize(input)
                    if (word not in stopwords['english'] and not word.isdigit())]
        return filtered
    # Otherwise the input is a list (or, when multiList is True, a list of lists).
    if not isinstance(input, basestring):
        if multiList:
            for index, lst in enumerate(input):
                input[index] = [word for word in normalizer.normalize(lst)
                                if (word not in stopwords['english'] and not word.isdigit())]
            return input
        else:
            input = [word for word in normalizer.normalize(input)
                     if (word not in stopwords['english'] and not word.isdigit())]
            return input

# print tokenizer.tokenize('sdfdsf sdffsd sdfsdfds')
# print remove_stop_word('hello i Am mayank. I Am a Good boy')
def test_contractions(self):
    """should expand contractions"""
    self.assertEqual(norm.normalize("I'm on the yelow zebra"), "I am on the yellow zebra")
    self.assertEqual(norm.normalize("I'll listen to y'all"), "I will listen to you all")
    self.assertEqual(norm.normalize("do n't make it right"), "do not make it right")
    self.assertEqual(norm.normalize("it's all good"), "it is all good")
def echo(self, data, start, end):
    if self.audio_file is not None:
        recording = self.asource.read()
    else:
        recording = b''.join(data)
    print("Acoustic Activity at: {0}--{1}".format(start, end))
    print(recording)
    # data = np.array(data)
    # serialized = np.frombuffer(data)
    # print(len(hex_data))
    # print(len(recording))
    normalize(recording)
    pad_tokens('tmp.wav2')
    sample_rate, normalized_signal = wavfile.read('tmp.wav2')
    print(sample_rate)
    print(len(normalized_signal))
    # Convert the padded signal to mel filter banks and shape them for the model.
    banks = convert_to_mel(normalized_signal)
    banks = np.array(banks)
    Banks = banks.reshape(1, 98, 40, 1)
    # np.save('test.npy', Banks)
    z = self.model.predict(Banks)
    # Pick the category with the highest score as the recognized keyword.
    p = z[0].tolist().index(max(z[0]))
    self.recognized_keyword = self.categories[p]
    print(self.recognized_keyword)
    '''
    frame_length, step_size = 16000, 64000
    no_of_shifts = int(64000 / step_size) - int(frame_length / step_size)
    print(no_of_shifts)
    # keyword = None
    prob = 0
    for i in range(0, no_of_shifts):
        l = int(i * 320)
        banks = convert_to_mel(normalized_signal[l:l + frame_length])
        banks = np.array(banks)
        np.save('test.npy', banks)
        # banks = np.load('sd.npy')
        Banks = banks.reshape(1, 98, 40, 1)
        z = self.model.predict(Banks)
        print(z)
        p = z[0].tolist().index(max(z[0]))
        prob += p
        self.recognized_keyword = self.categories[p]
        # self.recognized_keyword = self.categories[np.argmax(z[0], -1)]
        # com = self.commands[p]
        print(self.recognized_keyword)
    final = int(np.ceil(prob / no_of_shifts))
    print(prob / no_of_shifts)
    print(final)
    self.recognized_keyword = self.categories[final]
    '''
    K.clear_session()
    os.remove('tmp.wav2')
    os.remove('tmp.wav')
def load_problem(dom_name):
    print("Parsing...")
    p = parser.Problem(dom_name)
    p.max_faults = -1
    print("Normalizing...")
    for a in p.actions:
        normalize(a)
    print("Ready!")
    return p
def merge_clauses(sentences):
    """
    Takes JSON with a "language -> sentence" mapping:
    {
        'ru': russian_sentence,
        'en': english_sentence
    }
    Returns the list of clauses plus a response code:
    {
        'clauses': [
            {'ru': rus_clause_i, 'en': eng_clause_i},
            ...
        ],
        'response': {'code': 0, 'description': ''}
    }
    """
    input = json.loads(sentences)
    sent_rus = input['ru']
    sent_en = input['en']
    sent1 = json.loads(split_to_clauses(json.dumps({'ru': sent_rus})))
    sent2 = json.loads(split_to_clauses(json.dumps({'en': sent_en})))
    checking = check_stream(sent1, sent2)
    if checking['code'] == 1:
        zipped_clauses = zip(sent1['clauses'], sent2['clauses'])
    else:
        zipped_clauses = None
    # For now zipped_clauses[i][0] holds the Russian clause and zipped_clauses[i][1]
    # the English one; eventually this should be generalized to arbitrary languages.
    # Initialize the output structure.
    output = {'clauses': [], 'response': checking}
    if zipped_clauses:
        for pairs in zipped_clauses:
            output['clauses'].append({'ru': normalizer.normalize(pairs[0]),
                                      'en': normalizer.normalize(pairs[1])})
    else:
        output['clauses'] = None
    print(json.dumps(output))
    return json.dumps(output)
def evaluate(model, dev_data, loss_fn, save=False):
    print("Running evaluation...")
    model.eval()
    length = len(dev_data)
    # loss metrics
    l2_loss_fn = loss_fns.L2Loss()
    all_loss = []
    l2_losses = []
    for t, (x, y) in enumerate(dev_data):
        x_copy = np.copy(x.numpy())
        x_var = Variable(normalize(x).permute(0, 3, 1, 2)).type(dtype)
        y_var = Variable(normalize(y).permute(0, 3, 1, 2)).type(dtype)
        scores, _, C, M1, M2, res_img1, res_img2 = model(x_var)
        if t >= length - 2 and save:
            extra = results_folder + "extra/"
            os.makedirs(extra, exist_ok=True)
            for i in range(NUM_SAVED_SAMPLES):
                name = results_folder + "{}_{}_".format(t, i)
                convert_and_save(name + "gen.png", scores[i])
                convert_and_save(name + "gold.png", y_var[i])
                try:
                    convert_and_save(extra + "resgen1.png", res_img1[i])
                    convert_and_save(extra + "resgen2.png", res_img2[i])
                except Exception:
                    print(traceback.format_exc())
                # np.save(name + 'C', C.data.cpu().numpy())
                try:
                    np.save(extra + 'M1', M1.data.cpu().numpy())
                    np.save(extra + 'M2', M2.data.cpu().numpy())
                except Exception:
                    print(traceback.format_exc())
                # convert_and_save(name + "__Cx.png", )
                x_res = x_copy[i]
                try:
                    imsave(extra + "orig_0.png", x_res[:, :, :3])
                    imsave(extra + "orig_1.png", x_res[:, :, 3:])
                except Exception:
                    print(traceback.format_exc())
        all_loss.append(calculate_norm_loss(x_var, y_var, scores, loss_fn))
        l2_losses.append(calculate_norm_loss(x_var, y_var, scores, l2_loss_fn))
    total_loss = sum(all_loss) / len(all_loss)
    total_l2_loss = sum(l2_losses) / len(l2_losses)
    print("Eval norm l2 loss: %.4f, norm total loss: %.4f" % (total_l2_loss, total_loss))
    return total_loss
def _load(self):
    """
    Validates and normalizes Batch data

    Updates member `load_status` with `OK`, `BATCH_NO_DATA`,
    `BATCH_NOT_VALID` or `BATCH_NOT_NORMALIZED`

    :return: None
    """
    if self._data is None:
        logging.info('No data was found')
        self.load_status = BATCH_NO_DATA
        return
    status, message = validate(self._data)
    if status != OK:
        logging.info('Validation failed : ' + message)
        self.load_status = BATCH_NOT_VALID
        return
    self.name = self._data['name']
    self.icon_path = parse(self._data['icon_path'])
    tags, tasks, status = normalize(self._data)
    if status != OK:
        logging.info('Batch normalization failed')
        self.load_status = BATCH_NOT_NORMALIZED
        return
    self.tags = tags
    self.tasks = tasks
    self.load_status = OK
def train(self, X, y):
    theta0 = self.roll(self.theta)
    X, self.mean, self.std = normalize(X)
    self.nTrainingExamples = X.shape[0]
    results = minimizer(lambda x: self.cost_function(X, y, x), theta0, approx_grad=False)
    self.theta = self.unroll(self.theta, results[0])
    return results
def react(self, status):
    from_user = status.author.screen_name
    text = N.normalize(status.text)
    reply = self.lang.gen(text)
    print(from_user, text, reply)
    # Prefix the mention and truncate to Twitter's 140-character limit.
    reply = "@{} {}".format(from_user, reply)[0:140]
    self.api.update_status(reply, status.id_str)
def process():
    # Read the form data from the HTTP request
    text = request.form.get("text", "")
    # Preprocess the text
    text = preprocess(text)
    # Run POS tagging
    text = tag(text, "http://localhost:7000")
    # Run chunking
    text = chunk(text)
    # Run normalization
    text = normalize(text)
    # Build a JSON HTTP response containing the processed text
    return jsonify({
        "status": "success",
        "message": "Request successful",
        "data": {
            "text": text
        }
    })
def test_normalizer(self):
    norm_file = csv_functions.csv_open('test_norm.csv')

    expected = [[100.0, 0], [90.0, 5], [80.0, 7.5], [70.0, 25], [60.0, 40],
                [50.0, 50], [40.0, 40], [30.0, 22.5], [20.0, 17.5], [10.0, 7.5],
                [0.0, 2.5]]
    actual = csv_functions.csv_open('test_1.csv')
    actual = pixel_to_embryo_length.pixel_to_embryo_length(actual)
    normalizer.normalize(actual, norm_file)
    self.assertEqual(expected, actual)

    expected = [[100.0, 0], [90.0, 4], [80.0, 6], [70.0, 20], [60.0, 32],
                [50.0, 40], [40.0, 32], [30.0, 18], [20.0, 14], [10.0, 6],
                [0.0, 2]]
    actual = csv_functions.csv_open('test_2.csv')
    actual = pixel_to_embryo_length.pixel_to_embryo_length(actual)
    normalizer.normalize(actual, norm_file)
    self.assertEqual(expected, actual)
def parse(self):
    t = tokenizer.Tokenizer()
    for word in t.get_tokens(normalize(self.file_name)):
        self.process(word)
    if self.save:
        self.dictionary.save()
    return 0
def is_subset(a, b):
    '''
    params a, b are expressions in string
    return True if a ⊆ b or False if a ⊈ b
    '''
    import parser
    import normalizer
    if type(a) is str:
        a = parser.parse(a)
        a = normalizer.normalize(a)
    if type(b) is str:
        b = parser.parse(b)
        b = normalizer.normalize(b)
    return _is_subset(a, b)
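# Usage sketch (hypothetical expression strings; the accepted syntax is whatever
# parser.parse understands in this project):
# is_subset("a AND b", "a")   # string inputs are parsed and normalized first
# is_subset(expr_a, expr_b)   # already-parsed expressions skip the parsing step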
def train(model, loss_fn, optimizer, train_data, num_epochs=1):
    for epoch in range(num_epochs):
        print('Starting epoch %d / %d...' % (epoch + 1, num_epochs))
        model.train()
        for t, (x, y) in enumerate(train_data):
            x_var = Variable(normalize(x).permute(0, 3, 1, 2)).type(dtype)
            y_var = Variable(normalize(y).permute(0, 3, 1, 2)).type(dtype)
            scores = model(x_var)
            loss = loss_fn(scores, y_var)
            if (t + 1) % PRINT_EVERY == 0:
                print('\tt = %d, loss = %.4f' % (t + 1, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
def read_url(url):
    checked_links.append(url)
    url = n.normalize(url, main_url_domain, main_url_ext)  # check normalizer.py mailto: condition
    print("Fetching page at {}...".format(url), end='')
    if url is not None:
        try:
            url_request = requests.get(url)
        except Exception:
            print("Could not read url...")
            return None
        print("...done")
        if url != main_url:
            print("Checking: ", url)
            url_domain = s.extract(url)["url_domain"]
        else:
            url_domain = main_url_domain
        is_ok = True
        if url_request.status_code >= 400:
            broken_links.append(url)
            is_ok = False
            write_broken = url + "," + str(url_request.status_code) + "\n"
            broken_file.write(write_broken)
            print("* Broken url: ", url)
            print("")
            return None
        soup = BeautifulSoup(url_request.content, "html.parser", from_encoding="iso-8859-1")
        print("Looking for links on the webpage...", end='')
        url_list = soup.find_all('a', href=True)
        print("...done")
        print("")
        write_checked = url + "," + str(url_request.status_code) + "," + str(is_ok) + "\n"
        checked_file.write(write_checked)
        if url_domain == main_url_domain:
            for link in url_list:
                if not link['href']:
                    continue
                if link['href'] not in checked_links:
                    read_url(link['href'])
def pos_tags_count(text):
    pos_counts = {}
    parsed_tokens = normalize(text)
    pos_tags = get_only_pos(parsed_tokens)
    unique_tags = ['VERB', 'ADJ', 'NOUN', 'ADV', 'NUM', 'SCONJ', 'CCONJ', 'CONJ']
    for tag in unique_tags:
        pos_counts[tag] = pos_tags.count(tag) / len(pos_tags)
    # Merge subordinating and coordinating conjunctions into a single CONJ bucket.
    pos_counts['CONJ'] = pos_counts['SCONJ'] + pos_counts['CCONJ']
    del pos_counts['SCONJ']
    del pos_counts['CCONJ']
    return pos_counts
def team_rate_extractor(local=False):
    soup = get_html_soup(LOCAL_PATH, LINK, local)
    con = soup.find(id='pageContent').find(attrs={'class': 'content'})
    tables = con.find_all('table')
    data = []
    for t in tables:
        data.extend(parse_table(t))
    teamrating = []
    for team in data:
        if len(team) > 0:
            if len(team) >= 5:
                teamrating.append([normalize(team[-5]), team[-1]])
            else:
                teamrating.append([normalize(team[-2]), team[-1]])
    teamrating = sorted(teamrating, key=lambda x: float(x[1]), reverse=True)
    teamrating = list(zip(*teamrating))[0]
    return teamrating
def handle(self):
    global counter
    data = bytes.decode(self.request[0].strip())
    socket = self.request[1]
    print("%s : " % self.client_address[0], str(data))
    body = normalize(str(data))
    today = body['dt'].strftime('%Y-%m-%d')
    result = es.index(index=today, doc_type='event', body=body)
    if not result['created']:
        logging.info(str(data))
    counter += 1
    print("Got", counter, "messages")
def train(model, loss_fn, optimizer, train_data, val_data, num_epochs=1):
    losses = []
    eval_losses = []
    # Slow start (to prevent blowup).
    optimizer = optim.Adam(model.parameters(), lr=INIT_LR * 10 ** -4)
    for epoch in range(num_epochs):
        print('Starting epoch %d / %d...' % (epoch + 1, num_epochs))
        model.train()
        if epoch == 6:
            print("Lowering rate for refinement")
            optimizer = optim.Adam(model.parameters(), lr=INIT_LR / 10)
        if epoch == 11:
            print("Lowering rate for refinement 2")
            optimizer = optim.Adam(model.parameters(), lr=INIT_LR / 100)
        for t, (x, y) in enumerate(train_data):
            if epoch == 0 and t == 50:
                optimizer = optim.Adam(model.parameters(), lr=INIT_LR)
            # print(t)
            x_var = Variable(normalize(x).permute(0, 3, 1, 2)).type(dtype)
            y_var = Variable(normalize(y).permute(0, 3, 1, 2)).type(dtype)
            scores, oob_loss, _, _, _, _, _ = model(x_var)
            loss = loss_fn(scores, y_var)
            if (t + 1) % PRINT_EVERY == 0:
                norm_loss = calculate_norm_loss(x_var, y_var, scores, loss_fn)
                losses.append(norm_loss)
                print('\ttraining: t = %d, loss = %.4f, norm_loss = %.4f' % (
                    t + 1, loss.data[0], norm_loss))
            if not is_local and t % (len(train_data) // 8) == 0 or overfit_small:
                eval_loss = evaluate(model, val_data, loss_fn)
                eval_losses.append(eval_loss)
            optimizer.zero_grad()
            (loss + oob_loss).backward()
            optimizer.step()
    os.makedirs("losses", exist_ok=True)
    np.save(results_folder + 'losses' + NAME, np.array(losses))
    np.save(results_folder + 'losses/eval_losses' + NAME, np.array(eval_losses))
def eval(model, dev_data, loss_fn):
    print("Running evaluation...")
    total_loss = 0.0
    model.eval()
    length = len(dev_data)
    for t, (x, y) in enumerate(dev_data):
        x_var = Variable(normalize(x).permute(0, 3, 1, 2)).type(dtype)
        y_var = Variable(normalize(y).permute(0, 3, 1, 2)).type(dtype)
        scores = model(x_var)
        if t == length - 1:
            for i in range(NUM_SAVED_SAMPLES):
                name = "./eval/{}_{}_".format(t, i)
                imsave(name + "gen.png",
                       np.transpose(denorm(scores[i].data.cpu().numpy()), axes=[1, 2, 0]))
                imsave(name + "gold.png",
                       np.transpose(denorm(y_var[i].data.cpu().numpy()), axes=[1, 2, 0]))
                x = x_var[i].data.cpu().numpy()
                imsave(name + "orig_0.png", x[:3, :, :])
                imsave(name + "orig_1.png", x[3:, :, :])
        total_loss += loss_fn(scores, y_var).data[0]
    print("Total eval loss: %.4f, Avg eval loss: %.4f" % (total_loss, total_loss / NUM_VAL))
def test_can_normalize_data(self):
    lines = load_test_data('weather.dat')
    normal = normalize(lines)
    first_measurement = normal[0]
    last_measurement = normal[-1]
    self.assertEqual(1, first_measurement.day)
    self.assertEqual(59, first_measurement.min)
    self.assertEqual(88, first_measurement.max)
    self.assertEqual(29, first_measurement.delta())
    self.assertEqual(30, last_measurement.day)
    self.assertEqual(45, last_measurement.min)
    self.assertEqual(90, last_measurement.max)
    self.assertEqual(45, last_measurement.delta())
def calculate_score(input_data):
    result = COEFS.copy()
    for i in range(len(result)):
        result[i].append(input_data[i])
    for idx, row in enumerate(COEFS):
        result[idx].append(row[2] * row[-1])
    score = INTERCEPT
    for row in result:
        score += row[-1]
    return normalize(score)
def team_extractor(local=False):
    soup = get_html_soup(LOCAL_PATH, LINK, local)
    con = soup.find(id='pageContent').find(attrs={'class': 'content'})
    tables = con.find_all('table')
    data = []
    for t in tables:
        data.extend(parse_table(t))
    teams = {}
    for team in data:
        if len(team) > 0:
            teams[normalize(team[-4])] = [team[-1], team[-2], team[-3]]
    return teams
def weigh_match(self, pair):
    init_str = pair[1]
    query_str, completion_str = normalize(pair)
    # Skip the pair if there are no completions or the query contains no letters.
    if completion_str == 'NULL' or re.fullmatch(r'\W+', query_str):
        return False
    compare_obj = Compare(query_str, completion_str, init_str)
    Compare.calculate_weight(compare_obj)
    query_weight = compare_obj.max_obj.weight
    # Keep only pairs within the desired Levenshtein distance.
    if 0 <= query_weight <= 2:
        self.light_match = compare_obj.max_obj
    else:
        self.light_match = None
def sentences_to_indices(X, word_to_index, max_len):
    m = X.shape[0]
    X_indices = np.zeros((m, max_len), dtype=int)
    for i in range(m):
        sentence_words = normalize(X[i]).split()
        j = 0
        for w in sentence_words:
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
                j = j + 1
    return X_indices
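# Usage sketch (hypothetical vocabulary; the exact indices depend on what `normalize`
# returns for each sentence):
# word_to_index = {'i': 1, 'love': 2, 'you': 3}
# X = np.array(["I love you", "love you"])
# sentences_to_indices(X, word_to_index, max_len=4)
# Each row holds the known word indices in order, right-padded with zeros up to max_len,
# e.g. [[1, 2, 3, 0], [2, 3, 0, 0]] if normalize simply lowercases.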
def read_url(url):
    global count
    url = n.normalize(url, main_url_domain, main_url_ext)  # check normalizer.py mailto: condition
    if url is not None:
        url_request = requests.get(url)
        count += 1
        print(count)
        url_domain = s.extract(url)["url_domain"]
        is_ok = True
        if url_request.status_code >= 400:
            broken_links.append(url)
            is_ok = False
            write_broken = url + "," + str(url_request.status_code) + "\n"
            broken_file.write(write_broken)
            print(url_request.status_code)
        soup = BeautifulSoup(url_request.content, "html.parser", from_encoding="iso-8859-1")
        url_list = soup.find_all('a', href=True)
        checked_links.append(url)
        write_checked = str(count) + "," + url + "," + str(url_request.status_code) + "," + str(is_ok) + "\n"
        checked_file.write(write_checked)
        if url_domain == main_url_domain:
            for link in url_list:
                if link['href'] not in checked_links:
                    print(link['href'])
                    read_url(link['href'])
def run_episode(self, env, normalizer, addOrSubtractOperator, delta=None, render=False):
    """Gets the total reward for an episode"""
    total_reward = 0
    state = env.reset()
    for episode_number in range(self.options['MAX_EPISODES']):
        if render:
            env.render()
        normalizer.observe(state)
        state = normalizer.normalize(state)
        action = self.policy(state, addOrSubtractOperator, delta)
        state, reward, done, info = env.step(action)
        # Clip the reward to [-1, 1].
        reward = max(min(reward, 1), -1)
        total_reward += reward
        if done:
            break
    env.env.close()
    return total_reward
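# The `normalizer` passed to run_episode is assumed to keep running statistics of the
# observed states. A minimal sketch of such an object (an illustration, not this
# project's actual implementation) could look like:
import numpy as np

class RunningNormalizer:
    """Tracks a running mean/variance of states and rescales them to zero mean, unit std."""

    def __init__(self, num_inputs):
        self.n = 0
        self.mean = np.zeros(num_inputs)
        self.m2 = np.zeros(num_inputs)

    def observe(self, x):
        # Welford's online update of the mean and the (unnormalized) variance.
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def normalize(self, x):
        std = np.sqrt(self.m2 / max(self.n - 1, 1))
        return (x - self.mean) / np.where(std > 0.0, std, 1.0)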
def scoreboardextractor(local=False):
    soup = get_html_soup(LOCAL_PATH, LINK, local)
    con = soup.find('table')
    rows = con.find('tbody').find_all('td')
    scoreboard = []
    for row in rows:
        if row.text and '1' not in row.text and len(
                row.text) > 4 and 'tries' not in row.text:
            text = row.text
            for key in REGIONS:
                if text.startswith(key) and not REGIONS[key]:
                    # print(text)
                    text = text[len(key):]
                    REGIONS[key] = True
            scoreboard.append(normalize(text))
    # for i in range(len(scoreboard)):
    #     print(i, scoreboard[i])
    return scoreboard
def calculate_score(input_data):
    """
    Given the intercept and model coefficients, this function
    calculates the score of an input
    """
    result = COEFS.copy()
    # Flag answers that were true as 1, 0 otherwise.
    for idx, row in enumerate(COEFS):
        if row[1] in input_data.keys():
            result[idx].append(1)
        else:
            result[idx].append(0)
    # Multiply the flag by the coefficient to get the points.
    for idx, row in enumerate(COEFS):
        result[idx].append(row[2] * row[-1])
    # Sum all points and the intercept.
    score = INTERCEPT
    for row in result:
        score += row[-1]
    return normalize(score)
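# Illustration of the assumed COEFS layout (hypothetical values): each row carries an
# identifier, the answer key looked up in input_data, and the model coefficient, e.g.
# COEFS = [
#     [0, 'smokes', 1.25],
#     [1, 'exercises', -0.80],
# ]
# calculate_score({'smokes': True}) would then flag the first row with 1, the second
# with 0, and return normalize(INTERCEPT + 1.25).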
def do_predict_multiple(data):
    output = {}
    count = 0
    for l in xrange(len(data) - 1, MIN_LENGTH, -1):
        # for l in xrange(MIN_LENGTH, len(data)):
        for i in xrange(len(data) - l + 1 - 1, -1, -1):
            if count > max_analysis_count:
                break
            current = normalizer.normalize(np.array(data)[i:i + l])
            result = model.predict(current)[0]
            result = LABEL_LIST[int(result)]
            if result not in output:
                output[result] = 1
            else:
                output[result] += 1
            if result != 'random' and result != 'horizontal' and result != 'vertical':
                count += 1
    return output
def process_data_row(label, file_name):
    data = np.genfromtxt(file_name, delimiter=', ', dtype=int)
    data = normalizer.normalize(data)
    X.append(data)
    y.append(LABELS[label])
expected_return = expected_annual_return / (365.0 / days_owned)  # percent
price -= upcoming_dividend
broker_cut = (7.95 + 0.75 * contracts_purchased) / (contracts_purchased * 100)
if contracts_purchased == 10:
    # A hacky way of representing that "10" contracts purchased is just my way
    # of dividing everything by 10.
    broker_cut = 0.087 / 10

beta = 1
end_price_list = np.random.gamma(price, beta, 10000)

# Normalize the list by looping through normalization methods to get
# (a) the desired [expected] standard deviation and (b) the desired [expected] average return.
total_range = 0
for i in range(1, 10000, 1):
    step = i / 10.0
    temp_end_price_list = normalize(end_price_list, minimum=0, total_range=step)
    std_dev_actual = (np.std(map(lambda x: x - price, temp_end_price_list)) / price * 100)
    if abs(std_dev_actual - desired_stddev) < 0.1:  # 0.1%
        print("Using total range of %.2f, actual std dev is %.2f, desired std dev is %.2f"
              % (step, std_dev_actual, desired_stddev))
        end_price_list = temp_end_price_list
        total_range = step
        break

start_step = 1
if desired_stddev > 16:
    start_step = -10000
if desired_stddev > 12:
    start_step = -6000
elif desired_stddev > 9:
    start_step = -2000
for i in range(start_step, 10000, 1):
pluscomment = pluscomment.replace("※", "")
pluscomment = pluscomment.replace("∴", "")
pluscomment = pluscomment.replace("*", "")
pluscomment = pluscomment.replace("+", "")
pluscomment = pluscomment.replace("・", "")
pluscomment = pluscomment.replace("°", "")
pluscomment = pluscomment.replace("w", "")
"""
pluscomment = pluscomment.replace("null", "")
pluscomment = pluscomment.replace("\n", "")
pluscomment = pluscomment.replace("\t", "")
pluscomment = pluscomment.replace(" ", "")
pluscomment = pluscomment.replace(" ", "")
pluscomment = pluscomment.replace("ぁ", "あ")
pluscomment = re.sub(re.compile("[!-/:-@[-`{-~]"), '', pluscomment)
pluscomment = normalize(pluscomment.decode("utf-8"))
pluscomment = pluscomment.replace(u" ", "")
pluscomment = pluscomment.replace(u" ", "")
# Collapse repeated long-vowel marks to handle shouted comments.
pluscomment = pluscomment.replace(u"ーー", u"ー")
if pluscomment != '':
    pluscomment = tagger.parse(pluscomment.encode("utf-8"))
    # pluscomment = pluscomment.replace("\n", " ")
    pluscomment = pluscomment.replace(" ", " ")
    fo.write(pluscomment)
thread[ID][j]["comment"] = commenttext
fo.write("\n")
fo.close()

files = os.listdir('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video')
for nfile in files[1:2]:
tagger = MeCab.Tagger(
    '-Owakati -u /usr/local/Cellar/mecab/0.996/lib/mecab/dic/ipadic/ruiter-keyword.dic, /usr/local/Cellar/mecab/0.996/lib/mecab/dic/ipadic/wikipedia-keyword.dic,/usr/local/Cellar/mecab/0.996/lib/mecab/dic/ipadic/hatena-keyword.dic')
# commentfiles = os.listdir('comment')
for j in thread.keys():
    filename = ("businesstexts/" + str(j) + ".txt")
    fo = file(filename, 'w')
    # filename = ("comment2_" + ID + "/" + "sm20158." + "txt")
    commenttext = ''
    text = thread[j]['bodyText']
    if ".T)" in text:
        point = text.find(".T)")
        meigararmei = text[(point - 14):(point + 3)]
        text = text.replace(meigararmei, "")
    sentence = text.replace("。", " ")
    if sentence != '':
        sentence = unicodedata.normalize(
            "NFKC", "".join(unicodedata.normalize("NFKC", sentence.decode("utf-8")).split()))
        sentence = normalize(sentence)
        sentence = sentence.lower()
        sentence = re.sub(re.compile("[!-/:-@[-`{-~]"), '', sentence.encode("utf-8"))
        sentence = sentence.replace(" ", "")
        sentence = sentence.replace(" ", "")
        sentence = sentence.replace("、", "")
        sentence = tagger.parse(sentence)
        fo.write(sentence)
    else:
        continue
    fo.write("\n")
    fo.close()

filename = ("allbuisinessnews.txt")
fo = file(filename, 'w')
def do_predict_single(data):
    current = normalizer.normalize(np.array(data))
    result = model.predict(current)[0]
    result = LABEL_LIST[int(result)]
    return {result: 1}