def loadData(topic): docName="" if ".txt" in topic: #docName="~/Dropbox/sandbox/KnowledgeTools/Docs/"+topic docName=topic else: # call bash script to get and fix wiki article cmd="bash loader.sh "+topic.replace(' ', '_') #subprocess.Popen(cmd) process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) output = process.communicate()[0] docName=DOCS_DIR+topic.replace(' ', '_')+".txt" # open document and parse into sentences and words data=[] with open (docName) as f: lines = f.readlines() for line in lines: words = line.split() for word in words: data.append(word) #print (word) # return word array for article return data
def extract_data(input_file):
    """
    Extracts data from the input_file '../../timeline.htm'.
    Looks for <div class="comment"> elements and extracts the comment text.

    :param input_file: specify location of 'timeline.htm' (str)
    :return: data, a list of comment strings
    """
    # open the file, read its contents into output, then close it
    print("Reading {}...".format(input_file))
    t0 = time.time()
    with open(input_file, "r") as f:
        output = f.read()

    # Parse the HTML string using BeautifulSoup
    soup = BeautifulSoup(output, "lxml")
    extract_div = soup.find_all("div", {"class": "comment"})

    # Extract the text of each comment div
    data = []
    for line in extract_div:
        data.append(line.text)
    t1 = time.time()
    print("It took {} ms to extract the comments".format((t1 - t0) * 1000))
    return data
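# Usage sketch for extract_data above (assumptions: BeautifulSoup/lxml are
# installed and the '../../timeline.htm' export mentioned in the docstring
# exists; the output path is illustrative only).
comments = extract_data("../../timeline.htm")
print("Extracted {} comments".format(len(comments)))
with open("extracted_comments.txt", "w") as out:
    out.write("\n".join(comments))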
def create_dataframe(self):
    data = []
    for dialogue in ifilter(lambda x: x.has_deal(), self.examples):
        for turn in dialogue.turns:
            for u in turn.iter_utterances():
                row = {
                    'post_id': dialogue.post_id,
                    'chat_id': dialogue.chat_id,
                    'scenario_id': dialogue.scenario_id,
                    'buyer_target': dialogue.buyer_target,
                    'listing_price': dialogue.listing_price,
                    'margin_seller': dialogue.margins['seller'],
                    'margin_buyer': dialogue.margins['buyer'],
                    'stage': u.stage,
                    'role': turn.role,
                    'num_tokens': u.num_tokens(),
                }
                for a in u.speech_acts:
                    row['act_{}'.format(a[0].name)] = 1
                for cat, word_count in u.categories.iteritems():
                    row['cat_{}'.format(cat)] = sum(word_count.values())
                for q in dialogue.eval_questions:
                    for r in ('buyer', 'seller'):
                        key = 'eval_{question}_{role}'.format(question=q, role=r)
                        try:
                            row[key] = dialogue.eval_scores[r][q]
                        except KeyError:
                            row[key] = -1
                data.append(row)
    df = pd.DataFrame(data).fillna(0)
    return df
def question_sorter(data):
    from IPython.core.display import display, HTML
    display(HTML("<style>.container { width:100% !important; }</style>"))

    question_list = list(question_dict.values())
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    clean = '\n-----\n'.join(tokenizer.tokenize(data))
    split_list = clean.split('\n-----\n')

    # find the best fuzzy match for each question among the sentences
    matches1 = {}
    index_list = []
    for key in question_dict.keys():
        matches2 = {}
        i = 0
        for sentence in split_list:
            x = fuzz.ratio(question_dict[key], sentence)
            matches2[i] = x
            i += 1
        calc = max(matches2, key=matches2.get)
        maximum = [calc, matches2[calc]]
        start_index = max(matches2, key=matches2.get) + 1
        maximum.append(start_index)
        matches1[key] = maximum

    # keep only confident matches (fuzzy ratio >= 80)
    matches3 = {}
    for key, value in matches1.items():
        if value[1] >= 80:
            matches3[key] = matches1[key]
            index_list.append(matches1[key][0])
        else:
            pass
    index_list.pop(0)

    # the answer to each question ends where the next matched question starts
    i2 = 0
    for key, value in matches3.items():
        try:
            value.append(index_list[i2] - 1)
            i2 += 1
        except IndexError:
            value.append(len(split_list) - 1)

    # collect the sentences between each question and the next one
    responses = {}
    for key, value in matches3.items():
        i = value[2]
        data = []
        while i <= value[3]:
            data.append(split_list[i])
            i += 1
        responses[key] = data

    return responses
def process_tarfile_question_titles(tar):
    data = []
    for xml_text, filename in iter_xmls(tar):
        root = ET.fromstring(xml_text)
        title_node = root.find("front/article-meta/title-group/article-title")
        if title_node is None or title_node.text is None:
            logging.warning("No title: %s" % filename)
            continue
        title = title_node.text
        if title is None or title[-1] != "?":
            continue
        abstract_node = root.find("**/abstract")
        if abstract_node is None:
            logging.warning("No abstract: %s" % filename)
            continue
        abstract_xml = ET.tostring(abstract_node).decode("utf-8")
        data.append({
            "id": filename,
            "title": title,
            "abstract_xml": abstract_xml,
        })
    print("Done processing tarfile %s. %d Questions added." % (tar, len(data)))
    return data
def load_title(filename):
    data = []
    with open(filename) as fh:
        reader = csv.DictReader(fh)
        raw_data = list(reader)
        for row in raw_data:
            title = unicode(row['Headline'], errors='ignore').decode('utf-8').strip()
            clean_title = clean(title)
            clean_title = get_tokenized_lemmas(clean_title)
            id = row['Body ID']  # ignore the stance if there is any
            data.append((clean_title, id))
    return data

    # NOTE: the block below is unreachable (alternative implementation kept in the original)
    reader = unicode_csv_reader(open(filename))
    for row in reader:
        title = row[0]
        clean_title = clean(title)
        clean_title = get_tokenized_lemmas(clean_title)
        id = row[1]  # ignore the stance if there is any
        data.append((clean_title, id))
    return data
def loadData(topic): docName = "" if ".txt" in topic: #docName="~/Dropbox/sandbox/KnowledgeTools/Docs/"+topic docName = topic else: # call bash script to get and fix wiki article cmd = "bash loader.sh " + topic.replace(' ', '_') #subprocess.Popen(cmd) process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) output = process.communicate()[0] docName = DOCS_DIR + topic.replace(' ', '_') + ".txt" # open document and parse into sentences and words data = [] with open(docName) as f: lines = f.readlines() for line in lines: words = line.split() for word in words: data.append(word) #print (word) # return word array for article return data
def contextualize(data_set):
    data = []
    context = []
    for line in data_set:
        data.append(tuple(sentence2sequence(line)[0]))
        context.append(tuple(sentence2sequence(line)[1]))
    return data, context
def write_bert_tokenized_json_classification(filename, sentences, labels):
    data = []
    sentence_encodings = bert_tokenizer(sentences,
                                        return_offsets_mapping=False,
                                        padding=False,
                                        truncation=True)
    for i, label in enumerate(labels):
        token_id = sentence_encodings[i].ids
        type_id = sentence_encodings[i].type_ids
        data.append({
            'uid': str(i),
            'label': label,
            'token_id': token_id,
            'type_id': type_id
        })

    # Write the JSON dataset to ./ensemble_modeling directory
    with open(f'./ensemble_modeling/multi_task_learning/{filename}.json', 'w') as json_file:
        for line in data:
            json.dump(line, json_file)
            json_file.write('\n')

    # Write the JSON dataset to mt-dnn canonical data directory
    with open(f'../mt-dnn/canonical_data/{filename}.json', 'w') as json_file:
        for line in data:
            json.dump(line, json_file)
            json_file.write('\n')
def get_json(self):
    self.__get_files()
    data = []
    for topic in self.files:
        for file in self.files[topic]:
            content = FileReader(filePath=file).content()
            data.append({'category': topic, 'content': content})
    return data
def load_stance(filename):
    reader = unicode_csv_reader(open(filename))
    data = []
    for title, id, stance in reader:
        clean_title = clean(title)
        clean_title = get_tokenized_lemmas(clean_title)
        data.append((clean_title, id, stance.strip()))
    return data
def write_bert_tokenized_json_ner(filename, texts, spans):
    data = []
    offset_mappings = []
    texts = [text for text in texts]
    spans = [span for span in spans]
    text_encodings = bert_tokenizer(texts,
                                    return_offsets_mapping=True,
                                    padding=False,
                                    truncation=True)
    labels = [
        preserve_labels(text_encodings[i], span)
        for i, span in enumerate(spans)
    ]
    for i, label in enumerate(labels):
        # update the CLS and SEP label ids
        label[0], label[-1] = 2, 3
        # retrieve the token ids
        token_id = text_encodings[i].ids
        # retrieve the type ids
        type_id = text_encodings[i].type_ids
        # add tokenized post to data
        data.append({
            'uid': i,
            'label': label,
            'token_id': token_id,
            'type_id': type_id
        })
        # save the offsets mapping for computing scores later
        offset_mappings.append(text_encodings[i].offsets)

    # Write the JSON dataset to ./ensemble_modeling directory
    with open(f'./ensemble_modeling/multi_task_learning/{filename}.json', 'w') as json_file:
        for line in data:
            json.dump(line, json_file)
            json_file.write('\n')

    # Write the JSON dataset to mt-dnn canonical_data directory
    with open(f'../mt-dnn/canonical_data/{filename}.json', 'w') as json_file:
        for line in data:
            json.dump(line, json_file)
            json_file.write('\n')

    # Write the token offset mappings
    with open(f'./ensemble_modeling/multi_task_learning/{filename}_offsets.txt', 'w') as json_file:
        for line in offset_mappings:
            json.dump(line, json_file)
            json_file.write('\n')

    # Write the gold span labels
    with open(f'./ensemble_modeling/multi_task_learning/{filename}_spans.txt', 'w') as json_file:
        for span in spans:
            json.dump(span, json_file)
            json_file.write('\n')
def join_data(tokens, tags, lem):
    data = []
    for i in range(len(tokens)):
        dados = []
        dados.append(tokens[i])
        dados.append(tags[i])
        dados.append(lem[i])
        data.append(dados)
    return data
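# Minimal usage sketch for join_data above: the three lists are assumed to be
# aligned token / POS-tag / lemma sequences of equal length (example values
# are made up for illustration).
tokens = ['The', 'dogs', 'barked']
tags = ['DET', 'NOUN', 'VERB']
lemmas = ['the', 'dog', 'bark']
rows = join_data(tokens, tags, lemmas)
# rows == [['The', 'DET', 'the'], ['dogs', 'NOUN', 'dog'], ['barked', 'VERB', 'bark']]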
def __load_csv(self, filename):
    data = []
    with open(filename, 'rt') as csvfile:
        readr = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for row in readr:
            if len(row) > 0:
                data.append(row[0])
    return data
def get_url_predictions(self, url, ignore_pos=True):
    html = urlopen(url)
    soup = BeautifulSoup(html.read())
    data = []
    for string in soup.strings:
        string = " ".join(re.split("[^a-zA-Z.,!?]*", string.lower())).strip()
        data.append(string)
    return self._text_predictions(data, ignore_pos=ignore_pos)
def read_data(filename):
    #issue = {}
    data = []
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            #issue[row[0]] = row
            data.append(row)
    return np.array(data)
def load_stance(filename):
    # reader = unicode_csv_reader(open(filename))
    reader = unicode_csv_reader(open(filename, errors='ignore'))  # NOTE: Changed for python3.
    data = []
    for title, id, stance in reader:
        clean_title = clean(title)
        clean_title = get_tokenized_lemmas(clean_title)
        data.append((clean_title, id, stance.strip()))
    return data
def analyze_content(_str):
    print 'warning, deprecated'
    _str = str(_str)
    html = urlopen(_str)
    soup = BeautifulSoup(html.read())
    data = []
    for string in soup.strings:
        string = " ".join(re.split("[^a-zA-Z.,!?]*", string.lower())).strip()
        data.append(string)
    return get_tweets_predictions(data).tolist()
def append_result(text, label):
    """ appends text and label to objectivity.json """
    a_dict = {'text': text, 'label': label}

    with open('objectivity.json') as f:
        data = json.load(f)

    data.append(a_dict)

    with open('objectivity.json', 'w') as f:
        json.dump(data, f)
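# Usage sketch for append_result above (assumption: 'objectivity.json' already
# holds a JSON list, since the function reads it before appending; the example
# texts and labels are made up).
with open('objectivity.json', 'w') as f:
    json.dump([], f)  # seed an empty list once
append_result("The report was released on Tuesday.", "objective")
append_result("This is clearly the best movie of the year.", "subjective")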
def loadFile(fileName):
    firstFile = ""
    with open(fileName, 'r') as cFile:
        firstFile = cFile.read()
    firstFile = firstFile.split('\n')
    data = []
    for line in firstFile:
        data.append(line.split(','))
    return data
def read_data(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            target = row[4]
            data.append((plottext, target))
    (X, Ycat) = zip(*data)
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Ycat)
    global labels
    labels = le.inverse_transform([0, 1, 2, 3, 4])
    return (X, Y)
def read_data(file_name=''):
    data = []
    print('Reading data start', datetime.now())
    # file_name = 'Electronics_5.json'    # 13 secs loading time
    # file_name = 'Digital_Music_5.json'  # 1 sec loading time
    f = open(file_name, 'r')
    for line in f.readlines():
        tmp = json.loads(line)
        data.append([tmp['reviewText'], tmp['overall']])
    f.close()
    print('Reading data finish', datetime.now())
    return data
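# Usage sketch for read_data above: the input is assumed to be a JSON-lines
# review dump where each line has at least 'reviewText' and 'overall' fields,
# as in the Amazon review files mentioned in the comments.
reviews = read_data('Digital_Music_5.json')
texts = [r[0] for r in reviews]    # review text
ratings = [r[1] for r in reviews]  # star rating ('overall')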
def load_data(directory):
    data = list()
    files = list()
    for name in tqdm(os.listdir(directory)):
        try:
            filename = directory + '/' + name
            datum = load_doc(filename)
            document, summary = split_doc(datum)
            data.append({'document': document, 'summary': summary})
            files.append(filename)
        except UnicodeDecodeError:
            print(name)
    return data, files
def json_encode(dic):
    """ dictionary in json object out """
    print "TEST: json_encode call"
    data = []
    data.append(dic)
    data_string = json.dumps(data)
    print "TEST: json encoded ", data_string
    return data_string
def load_title(filename):
    data = []
    with open(filename, errors='ignore') as fh:
        reader = csv.DictReader(fh)
        raw_data = list(reader)
        for row in raw_data:
            title = str(row['Headline']).strip()
            clean_title = clean(title)
            clean_title = get_tokenized_lemmas(clean_title)
            id = row['Body ID']  # ignore the stance if there is any
            data.append((clean_title, id))
    return data
def generate_dataset(corpus):
    output = []
    for line in corpus:
        token_list = line
        for i in range(1, len(token_list)):
            data = []
            x_ngram = '<start> ' + token_list[:i+1] + ' <end>'
            y_ngram = '<start> ' + token_list[i+1:] + ' <end>'
            data.append(x_ngram)
            data.append(y_ngram)
            output.append(data)
    print("Dataset prepared with prefix and suffixes for teacher forcing technique")
    dummy_df = pd.DataFrame(output, columns=['input', 'output'])
    return output, dummy_df
def get_json(self):
    self.__get_files()
    data = []
    for topic in self.files:
        rand = randint(100, 150)
        i = 0
        for file in self.files[topic]:
            content = FileReader(filePath=file).content()
            data.append({'category': topic, 'content': content})
            if i == rand:
                break
            else:
                i += 1
    return data
def read_data_eval(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            genre = {'genre': row[3]}
            year = row[2]
            target = row[4]
            data.append((plottext, genre, target, year))
    (X, genre, Ycat, year) = zip(*data)
    year = np.array(year, dtype='float')
    year = yearscale.transform(year).reshape((-1, 1))
    Y = le.transform(Ycat)
    genre = dv.transform(genre)
    return (X, Y, genre, year)
def read_data_eval(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            genre = { 'genre': row[3] }
            year = row[2]
            target = row[4]
            data.append((plottext,genre,target,year))
    (X, genre, Ycat,year) = zip(*data)
    year = np.array(year, dtype='float')
    year = yearscale.transform(year).reshape((-1,1))
    Y = le.transform(Ycat)
    genre = dv.transform(genre)
    return (X, Y, genre, year)
def read_corpus(file_path, source):
    """ Read file, where each sentence is delineated by a `\n`.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text is of the source language or target language
    """
    data = []
    for line in open(file_path):
        sent = nltk.word_tokenize(line)
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)
    return data
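# Usage sketch for read_corpus above (the file paths are placeholders): source
# sentences come back as plain token lists, while target sentences are wrapped
# in <s> ... </s> markers.
src_sents = read_corpus('train.src', source='src')
tgt_sents = read_corpus('train.tgt', source='tgt')
# tgt_sents[0] might look like ['<s>', 'Hello', 'world', '.', '</s>']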
def addFileToData(filename, data):
    intColumns = ['No. Reader', 'No. Helpful', 'Cleanliness', 'Check in / front desk',
                  'Value', 'Overall', 'Service', 'Business service', 'Rooms', 'Location']
    characterThreshold = 60
    with open(filename, 'r') as content_file:
        content = content_file.read()
    #print(repr(content))
    if content.count("\r") > 0:
        reviews = content.split("\r\n\r\n")
    else:
        reviews = content.split("\n\n")
    for r in reviews:
        thisReview = pd.Series([None]*len(cats), cats)
        splt = r.split("\n")
        for s in splt:
            for c in cats:
                if "<"+c+">" in s:
                    value = s.replace('<'+c+'>', '')
                    if c in intColumns:
                        value = int(value)
                        if value == -1:
                            # we don't want -1 as this is going to mess up averaging, take np.nan
                            value = np.nan
                    if c == "Content":
                        value = remove_non_ascii(value.lower())
                    thisReview[c] = value
        # only add if there's content and it's long enough
        if not thisReview["Content"] == None and len(thisReview["Content"]) > characterThreshold:
            data = data.append(thisReview, ignore_index=True)
    return data
def json_file(data_string, filename, fcnname):
    """
    jsonEncoded data_string in, filename, and function name in
    json file out
    naming convention for json files: "filename(including ext).functionname.json"
    """
    print "TEST: json_file call"
    data = []
    data.append(data_string)
    f = open((str(filename) + '.' + str(fcnname) + ".json"), "w+")
    json.dump(data, f, separators=('},{', ', '))
    f.flush()
    print "TEST: successful json dump and flush"
    return data_string
def run(self):
    t0 = time()
    params = self.params
    numX = len(params.Xinds)
    data = []
    Pw_zs = []
    ####
    for i in range(numX):
        data.append(params.ws[i]*self.Xs[params.Xinds[i]])
        Pw_zs.append(self.initsDescriptor.Pw_zs[params.Xinds[i]])
    ####
    self.initsDescriptor.Pw_zs = Pw_zs
    ####
    sys.stderr.write('PLSABet run at node '+str(params.eventID)+'...\n')
    self.descriptor = pLSABet(data, self.initsDescriptor, params.lambdaB,
                              params.selectTime, self.DT, params.wt, params.Learn, params.debug)
    #
    if self.descriptor is None:
        sys.stderr.write("############### Pw_zs = None. Reduce K!!! \n")
        exit(-1)
    print("pLSA done in " + str(time() - t0))
def loadDoc(topic): docName="" if "." in topic: #docName="~/Dropbox/sandbox/KnowledgeTools/Docs/"+topic docName=topic else: # call bash script to get and fix wiki article cmd="bash "+CODE_DIR+"loader.sh "+'"'+topic.replace(' ', '_')+'"' #cmd="bash loader.sh \""+topic.replace+"\"" #print "CMD =", cmd #subprocess.Popen(cmd) #process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) #output = process.communicate()[0] subprocess.call(cmd, shell=True) docName=DOCS_DIR+topic.lower().replace(' ', '_')+".txt" tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') data=[] with open(docName) as f: while True: c = f.read(1) if not c: break if ord(c) < 32 or ord(c) > 126: data.append(' ') else: data.append(c) data = ''.join(data) parsedData = tokenizer.tokenize(data) newParsedData = [sentence.split() for sentence in parsedData] return newParsedData
def load_file(filePath):
    fname, ext = os.path.splitext(filePath)
    dictionary = {}
    data = []
    with open(filePath) as data_file:
        for line in data_file:
            data.append(json.loads(line))
    for d1 in data:
        bid = d1.get('business_id')
        review = d1.get('text')
        rid = d1.get('review_id')
        dict_temp = {bid: review}
        dictionary[rid] = dict_temp
    #print (dictionary)
    return dictionary
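# Usage sketch for load_file above (assumption: the file is a JSON-lines dump
# where each line has 'review_id', 'business_id' and 'text' fields, Yelp-style;
# the path is a placeholder).
reviews_by_id = load_file('reviews.json')
some_review_id = next(iter(reviews_by_id))
print(reviews_by_id[some_review_id])  # -> {business_id: review_text}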
def save_to_elasticsearch(es: elasticsearch.Elasticsearch, bucket: str, key: str,
                          sentences: list):
    """
    Index a parsed document and its sentences into Elasticsearch.

    :param es: Elasticsearch client to index into
    :param bucket: bucket containing the source document
    :param key: document key, formatted as 'cik|form_type|as_of_date|company_name|edgar_file'
    :param sentences: list of parsed sentence strings to index as text_line documents
    :return: None
    """
    parse_date = datetime.datetime.now()
    text_source_doc_id = bucket + "|" + key
    cik, form_type, as_of_date, company_name, edgar_file = key.split('|')
    cik = int(cik)
    as_of_date = datetime.datetime.strptime(as_of_date, '%Y%m%d').date()
    line_count = len(sentences)
    data = []

    text_source_action = {
        "cik": cik,
        "form_type": form_type,
        "as_of_date": as_of_date,
        "line_count": line_count,
        "parse_date": parse_date,
        "parser_version": __version__
    }
    res = es.index(index="text_source", body=text_source_action)
    text_source_id = res['_id']

    for line_number, content in enumerate(sentences, 1):
        line_action = {
            "_index": "text_line",
            "text_source_id": text_source_id,
            "content": content,
            "line_number": line_number,
            "as_of_date": as_of_date,
            "cik": cik,
            "form_type": form_type
        }
        data.append(line_action)

    _logger.info("Saving to elasticsearch: {text_source_doc_id}".format(
        text_source_doc_id=text_source_doc_id))
    helpers.bulk(es, data)
def read_data(f):
    data = []
    for row in csv.reader(open(f), delimiter=';'):
        if row:
            plottext = row[8].decode('utf-8-sig')
            genre = { 'genre': row[3] }
            year = row[2]
            target = row[4]
            data.append((plottext,genre,target,year))
    (X, genre, Ycat,year) = zip(*data)
    year = np.array(year, dtype='float')
    global yearscale
    yearscale = preprocessing.StandardScaler()
    year = yearscale.fit_transform(year).reshape((-1,1))
    global le
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Ycat)
    global labels
    labels = le.inverse_transform([0,1,2,3,4])
    global dv
    dv = DictVectorizer(sparse=False)
    genre = dv.fit_transform(genre)
    return (X, Y, genre, year)
def jsonEncode(dic):
    # dictionary in json object out
    data = []
    data.append(dic)
    data_string = json.dumps(data, separators=(',', ':'))
    print "JSON ENCODED: ", data_string
    return data_string
def jsonToFile(data_string, name):
    # dictionary in json file out "jsondata"
    data = []
    data.append(data_string)
    f = open((name + ".json"), "w+")
    json.dump(data, f, separators=(',', ':'))
    f.flush()