def data_check(file_name):
    '''
    This piece of code performs the following operations:
    1. Iterates through each datafile and checks whether new data was uploaded.
       There are situations when the webpage rejects the requests made by the
       scraper, which leads to missing data if not tracked correctly.
    2. Sends notifications to Slack.
    '''
    with open(
            '/Users/nikhilsawal/OneDrive/investment_portfolio/datafiles/{}'.format(file_name),
            'rb') as inputfile:

        now = datetime.now()
        now = now.strftime('%Y-%m-%d %H')
        all_dates = []

        for item in json_lines.reader(inputfile):
            date_val = datetime.strptime(item['datetime'], '%Y-%m-%d %H:%M:%S')
            all_dates.append(date_val.strftime('%Y-%m-%d %H'))

        if now in all_dates:
            unicode = "\u2705"
            status = "Success"
            description = 'Data added to {}'.format(file_name)
        else:
            unicode = "\u274C"
            status = "Fail"
            description = 'Data NOT added to {}'.format(file_name)

        # Send Slack notifications
        hf.slack_msg("""
        ```
        datafile: {},
        status: {},
        description: {}
        ```
        """.format(unicode + file_name, status, description))
def get_debug_for_cluster(repo, graph, cluster_uri):
    did = repo
    if graph:
        did = repo + '-' + re.sub('[^0-9a-zA-Z]+', '-', graph)

    # get debug file for repo/graph if it hasn't been loaded yet
    if did not in debugs:
        debugs[did] = []
        debug_file = 'debug/' + did + '.jl'
        if os.path.isfile(debug_file):
            with open(debug_file, 'r') as f:
                for line in json_lines.reader(f):
                    debugs[did].append(line)
        else:
            return None

    entity_uri = cluster_uri.replace('-cluster', '')
    for debug in debugs[did]:
        if entity_uri in debug['all_records']:
            return debug
    return None  # not found
def _load_annotationsQA_R(annotations_jsonpath, split):
    """
    Build an index out of FOIL annotations, mapping each image ID with its
    corresponding captions.
    """
    entries = []
    with open(annotations_jsonpath, 'rb') as f:
        for annotation in json_lines.reader(f):
            if split == 'test':
                for answer in annotation["answer_choices"]:
                    question = annotation["question"] + ["[MARK]"] + answer
                    img_id = _converId(annotation["img_id"])
                    ans_label = 0
                    anno_id = int(annotation["annot_id"].split('-')[1])
                    entries.append({
                        "question": question,
                        "answers": annotation["rationale_choices"],
                        "metadata_fn": annotation["metadata_fn"],
                        "target": ans_label,
                        "img_id": img_id,
                        "anno_id": anno_id,
                        "det_names": annotation['objects']
                    })
            else:
                det_names = ""
                question = annotation["question"] + ["[MARK]"] + \
                    annotation["answer_choices"][annotation['answer_label']]
                ans_label = annotation["rationale_label"]
                img_id = _converId(annotation["img_id"])
                anno_id = int(annotation["annot_id"].split('-')[1])
                entries.append({
                    "question": question,
                    "answers": annotation["rationale_choices"],
                    "metadata_fn": annotation["metadata_fn"],
                    "target": ans_label,
                    "img_id": img_id,
                    "anno_id": anno_id,
                    "det_names": annotation['objects']
                })
    return entries
def get_multinli(data_path: str, prefix: str, suffix: str, dataset: str,
                 genres: list = None) -> dict:
    path = os.path.join(data_path, prefix + dataset + suffix)
    labels = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

    with open(path) as f:
        data = [item for item in json_lines.reader(f)]

    s1, s2, label = [], [], []
    for entry in data:
        if genres is None or entry['genre'] in genres:
            if entry['gold_label'] in labels:
                s1.append(entry['sentence1'])
                s2.append(entry['sentence2'])
                label.append(labels[entry['gold_label']])

    return {'s1': s1, 's2': s2, 'label': label}
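A hedged usage sketch for the loader above. The directory layout, file prefix/suffix, and genre names are assumptions based on the standard MultiNLI release, not taken from the original project.

# Hypothetical call; 'data/multinli' and the genre list are illustrative.
train = get_multinli(data_path='data/multinli',
                     prefix='multinli_1.0_',
                     suffix='.jsonl',
                     dataset='train',
                     genres=['fiction', 'travel'])
print(len(train['s1']), len(train['label']))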
def indexing():
    # Reading the jsonl file
    with open('sample-1M.jsonl', 'rb') as f:
        i = 1
        # Index the first 5,000 records and skip the rest
        for item in json_lines.reader(f):
            if (i >= 5000):
                continue
            else:
                # Converting string formatted data to json file format
                item = json.dumps(item)
                # Reading the file in order to load it to the elasticsearch
                decoded = json.loads(item)
                # Adding to the index
                es.index(index='news_article', doc_type='articles', id=i, body=decoded)
                i += 1
def parse_json(input_file, output_file):
    with open(output_file, encoding='utf8', mode='w', newline='') as features_file:
        features_writer = csv.writer(features_file, delimiter=',',
                                     quotechar='', quoting=csv.QUOTE_NONE)
        features_writer.writerow(['Tweet_ID', 'Crowd_Label'])

        with open(input_file, 'rb') as f:
            for tweet_result in json_lines.reader(f):
                # Credible tweets
                if tweet_result['results']['sentiment']['agg'] == 'definetly_credible' \
                        or tweet_result['results']['sentiment']['agg'] == 'seems_credible':
                    features_writer.writerow(
                        [tweet_result['data']['tweet_id'], 1])
                # Not credible tweets
                elif tweet_result['results']['sentiment']['agg'] == 'definitely_not_credible':
                    features_writer.writerow(
                        [tweet_result['data']['tweet_id'], 0])
                # Skipping "cannot decide" and none
                else:
                    continue
def loadJsonlData(file: str) -> list:
    '''
    Reads the data as saved in a .jsonl file

    Args:
    ----
        file: String corresponding to the path to a .jsonl file which contains
              the tweets as received from the TwitterAPI.

    Returns:
    -------
        tweets: A list of all the data saved in the .jsonl file.
    '''
    tweets = []
    with open(file, 'rb') as f:
        for tweet in json_lines.reader(f, broken=True):
            try:
                tweets.append(tweet)
            except (UnicodeDecodeError, json.JSONDecodeError):
                pass
    return tweets
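Note that `broken=True` already tells json_lines to skip lines it cannot parse (for example a truncated final line), so the inner try/except rarely fires. A brief usage sketch; the filename is an assumption:

# Hypothetical usage; 'tweets.jsonl' is an assumed path.
tweets = loadJsonlData('tweets.jsonl')
print('loaded', len(tweets), 'tweets')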
def index_ngram():
    print('Indexing ngram...')
    schema = Schema(id=ID(stored=True),
                    question=NGRAM(minsize=2, maxsize=7),
                    answer=NGRAM(minsize=2, maxsize=7))
    if not os.path.exists('index_ngram'):
        os.mkdir('index_ngram')
    ix = create_in('index_ngram', schema)
    writer = ix.writer()

    with open(PATH_QUESTION_ANSWER, 'r') as f:
        for qa in json_lines.reader(f):
            # print(qa['question'])
            # print(qa['answer'])
            # print('\n')
            if not convenion.is_valid_qa(qa):
                continue
            question = convenion.customize_and_remove_stopword(qa['question'])
            answer = convenion.customize_and_remove_stopword(qa['answer'])
            writer.add_document(id=qa['id_cmt'], question=question, answer=answer)

    print('Commit ngram...')
    writer.commit()
def preprocess_ARC():
    if getpass.getuser() == 'Mitch':
        # directory on my computer
        head = 'C:/Users/Mitch/PycharmProjects'
    else:
        # directory on compute
        head = '/home/kinne174/private/PythonProjects'

    difficulties = ['Easy', 'Challenge']
    partitions = ['Train', 'Dev', 'Test']
    # all_filenames = ['ARC-' + '-'.join(dp) + '.jsonl' for dp in product(difficulties, partitions)]

    for d in difficulties:
        for p in partitions:
            output = []
            ARC_filename = 'ARC/ARC-V1-Feb2018-2/ARC-{}/ARC-{}-{}.jsonl'.format(d, d, p)
            dataset_filename = os.path.join(head, ARC_filename)

            if os.path.exists(dataset_filename):
                with open(dataset_filename, 'r', encoding='utf-8') as df:
                    for ind, item in enumerate(json_lines.reader(df)):
                        # build a fresh dict per line so earlier entries are not overwritten
                        data = {}
                        data['id'] = item['id']
                        data['question'] = item['question']['stem']
                        data['choices_text'] = [choice['text'] for choice in item['question']['choices']]
                        data['choices_labels'] = [choice['label'] for choice in item['question']['choices']]
                        data['answer'] = item['answerKey']

                        output += [data]
            else:
                raise Exception("Filename {} does not exist!".format(dataset_filename))

            with open(os.path.join(head, 'hf_transformers/data/{}-{}.json'.format(d, p)), 'w') as of:
                json.dump(output, of)
def preprocess_jsonl(self, input_file_path, max_token_num):
    """
    Handles reading data from the input jsonl file and writing preprocessed
    data into a separate file.

    Preprocessed data is in the json format: np array of
    {"sentence1": ..., "sentence2": ..., "gold_label": 1}, semi sorted.

    1. Extracting sentences and gold label from the jsonl file, removing
       instances with the label "-" for gold label from the dataset
    2. Prepending each sentence with the NULL token
    3. Padding the sentences up to the maximum length of 20 words

    Args:
        input_file_path: path to the file where the input jsonl is
        max_token_num: the number of tokens to which sentences are padded

    Returns:
        np array of new dictionaries
        {"sentence1": ..., "sentence2": ..., "gold_label": [1, 0, 0]}, semi sorted
    """
    data_list = []
    with open(input_file_path, 'rb') as input_file:  # opening file in binary (rb) mode
        for item in json_lines.reader(input_file):
            if item["gold_label"] != "-":  # Removing unlabeled data
                new_item = {}
                # Prepending sentences with the NULL token
                token_array1 = ('\0 ' + item["sentence1"]).split()
                token_array2 = ('\0 ' + item["sentence2"]).split()
                if len(token_array1) <= 20 and len(token_array2) <= 20:
                    new_item["sentence1"] = self.pad_sentence(token_array1, max_token_num)
                    new_item["sentence2"] = self.pad_sentence(token_array2, max_token_num)
                    # Converting gold label to vector representation
                    new_item["gold_label"] = self.GOLD_LABELS[item["gold_label"]]
                    data_list.append(new_item)

    random.shuffle(data_list)
    return np.array(data_list)
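The method above relies on a pad_sentence helper and a GOLD_LABELS mapping that are not shown. A minimal sketch of what such a padding helper could look like; the pad token and exact behaviour are assumptions, not the project's actual implementation:

def pad_sentence(tokens, max_token_num, pad_token='<PAD>'):
    # Hypothetical helper: right-pad the token list up to max_token_num.
    return tokens + [pad_token] * (max_token_num - len(tokens))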
def parse(self, response):
    doc = response.css('body')
    with open('C:\\Users\\Ron\\git\\docSpider\\nnames.jl', 'rb') as f:
        for item in json_lines.reader(f):
            if response.url == item['url']:
                names = item['result']['PERSON']
                for name in names:
                    print(name)
                    yield {
                        'name': name,
                        'text': (doc.xpath('//*[contains(text(), "' + name +
                                           '")]/../../*/*/text()').extract()),
                        'url': response.url
                    }
    # next_page = response.xpath('.//a[contains(@class, "header")]/@href').extract_first()
    # if next_page is not None:
    #     next_page = response.urljoin(next_page)
    #     yield scrapy.Request(next_page, callback=self.parse)
def hashtagnetwork(filename, giant_component=False):
    """Generate Hashtag Network from Twitter data collection.

    Parameters:
        filename: path to jsonl twitter object to transform
        giant_component (boolean): keep only largest weakly connected component

    Returns:
        igraph graph object: hashtag network where a link is created between
        i and j if i and j appear in the same tweet.
    """
    edgelist = []
    with open(filename, 'rb') as f:
        for tweet in json_lines.reader(f):
            if len(tweet["entities"]["hashtags"]) > 1:
                cohashtags = []
                for element in tweet["entities"]["hashtags"]:
                    hashtag = element["text"]
                    cohashtags.append(hashtag)
                combs = list(combinations(cohashtags, 2))
                for element in combs:
                    source = element[0]
                    target = element[1]
                    edgelist.append((source, target))

    H = ig.Graph.DictList(
        edges=(dict(source=source, target=target, weight=1)
               for source, target in edgelist),
        vertices=None,
        directed=False)

    if giant_component == True:
        H = H.components().giant()
    H.es['weight'] = 1
    # H = H.simplify(combine_edges=dict(weight="sum"))
    return H
def readData(file_name, word2vec=w2v, load_w2v=True):
    """
    Read the data. Filter out the neutral data.
    For a datum that has multiple labels, select the most common one as its label.
    Input: (str) file_name
    Output: (list) premise-hypothesis pairs
    """
    print "Reading data file %s..." % file_name
    ph_pairs = []
    label_cnt = {'entailment': 0, 'contradiction': 0, 'neutral': 0}
    corpus_dict = Lang('en')

    with open(file_name, 'rb') as f:
        for item in jsonl.reader(f):
            p = normalizeString(item['sentence1'])
            h = normalizeString(item['sentence2'])
            l = getLabel(item['annotator_labels'])
            label_cnt[l] += 1
            datum = phPair(p, h, l)
            if datum.label != 'neutral':
                ph_pairs.append(datum)
                corpus_dict.addSentence(p)
                corpus_dict.addSentence(h)

    print "Loading dataset completed !"
    print "Loading word2vec model..."
    glove = np.zeros((5, 5))
    if load_w2v:
        glove = load_pretrained_embedding(corpus_dict, word2vec)
    print "Loading word2vec done!"
    print "Corpus used %d words" % corpus_dict.n_words
    print "Data distributions: %s" % label_cnt
    return ph_pairs, corpus_dict, glove
import json_lines as jl
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-f', nargs='*')
args = parser.parse_args()

for arg in args.f:
    with open(arg) as f:
        data = jl.reader(f)
        for line in data:
            txt = open('dic.txt', 'a')
            txt.write(line['vin'] + '\n')
from bs4 import BeautifulSoup
import json_lines
import json

f = open("PsychologyToday_All.jl")
counter = 0
lister = []
fp = open("ParsedPsychologyToday.jl", "w")

for i in json_lines.reader(f):
    counter += 1
    print counter
    dicto = {}
    # url for webpage
    dicto["url"] = i["url"]

    # Group title
    soup = BeautifulSoup(i["raw_content"], "lxml")
    if len(soup.find_all(attrs={"class": "groups-section"})) > 0:
        x = BeautifulSoup(
            str(soup.find_all(attrs={"class": "groups-section"})[0]), "lxml")
        x = BeautifulSoup(str(x.find_all(attrs={"class": "group-title"})[0]), "lxml")
        if x.h2 != None:
            dicto["group_name"] = x.h2.text
        elif x.h3 != None:
try:
    os.mkdir(options.opath + "/preprocessing_data/preprocess/" + urlprefixdomain + "/w2p/bitextorlang")
except FileExistsError:
    pass
try:
    os.mkdir(options.opath + "/preprocessing_data/preprocess/" + urlprefixdomain + "/w2p/bitextorlang/" + options.lang)
except FileExistsError:
    pass

outputpath = options.opath + "/preprocessing_data/preprocess/" + urlprefixdomain + "/w2p/bitextorlang/" + options.lang
json_file = gzip.open(options.newsfile, "rb")

with lzma.open(outputpath + "/url.xz", 'w') as urlfile, \
        lzma.open(outputpath + "/plain_text.xz", 'w') as bodyfile, \
        lzma.open(outputpath + "/date.xz", 'w') as datefile:
    for newspiece in json_lines.reader(json_file):
        body = base64.b64encode(
            str.encode(newspiece["headline"] + "\n" + newspiece["body"]))
        date = dateutil.parser.parse(newspiece["firstPublished"])
        url = options.urlprefix + newspiece["id"]

        urlfile.write(str.encode(url + "\n"))
        datefile.write(
            str.encode("%04d%02d%02d\n" % (date.year, date.month, date.day)))
        bodyfile.write(str.encode(body.decode("utf-8") + "\n"))
def identifyClasses(pos_lemmaTagged_factFile, clusters_factFile):
    classifyVerbFileName = getFileNamePart(clusters_factFile, '.json') + "_classifyVerb.json"
    annotatedVerbFileName = getFileNamePart(clusters_factFile, '.json') + "_annotatedVerb.json"

    clustersDict = {}
    entityClusterDict = {}
    with open(clusters_factFile, 'r') as clusters_File:
        print("Reading Clusters...")
        clustersDict = json.load(clusters_File)
        pprint(clustersDict)
        for clusterNo in clustersDict:
            cluster_name = 'cluster_' + clusterNo
            for clusterItem in clustersDict[clusterNo]:
                entityClusterDict[clusterItem] = clusterNo  # cluster_name
    # pprint(entityClusterDict)

    clusterVerbDict = {}
    posLemmaVerbDict = {}
    with open(pos_lemmaTagged_factFile, 'r') as pos_lemmaFile, open(clusters_factFile, 'r') as clusters_File:
        for item in json_lines.reader(pos_lemmaFile):
            itemKeys = item.keys()
            if item['isFact']:
                pos_nn = ""
                if 'POS_NN' in itemKeys:
                    pos_nn = item['POS_NN']
                pos_nnp = ""
                if 'POS_NNP' in itemKeys:
                    pos_nnp = item['POS_NNP']
                lemma_verb = ""
                if 'Lemma_Verb' in itemKeys:
                    lemma_verb = item['Lemma_Verb']
                pos_verb = ""
                if 'POS_Verb' in itemKeys:
                    pos_verb = item['POS_Verb']
                # print(pos_nnp, pos_verb, pos_nn, lemma_verb)

                posLemmaVerbDict[pos_verb] = lemma_verb
                clusterVerb = entityClusterDict.get(pos_verb, None)
                clusterNN = entityClusterDict.get(pos_nn, None)
                clusterNNP = entityClusterDict.get(pos_nnp, None)
                # print(clusterNNP, clusterVerb, clusterNN)

                if clusterVerb:
                    relationTuple = clusterNNP + ':' + clusterNN
                    if clusterVerb in clusterVerbDict:
                        clusterVerbDict[clusterVerb].add(relationTuple)
                    else:
                        clusterVerbDict[clusterVerb] = set([relationTuple])

    print("Cluster Mapping Relation")
    pprint(clusterVerbDict)
    print()

    resultDict = {}
    for clusterVerbId in clusterVerbDict:
        # print(clusterVerbId)
        verbDetails = {}
        verbName = 'verb_' + clusterVerbId
        verbData = clustersDict[clusterVerbId]
        for mapping in clusterVerbDict[clusterVerbId]:
            clustersMapped = mapping.split(':')
            setA = 'set_A'
            setA_Index = clustersMapped[0]
            setA_Data = clustersDict[setA_Index]
            setB = 'set_B'
            setB_Index = clustersMapped[1]
            setB_Data = clustersDict[setB_Index]

            verbDetails['data'] = verbData
            verbDetails['lemma_data'] = getLemmaVerbData(posLemmaVerbDict, verbData)
            verbDetails[setA] = setA_Data
            verbDetails[setB] = setB_Data
        resultDict[verbName] = verbDetails

    pprint(resultDict)
    resultFilePath = "verbMapping/" + classifyVerbFileName
    with open(resultFilePath, 'w') as outFile:
        json.dump(resultDict, outFile, indent=4)
    print("verbMapping results found in {}".format(resultFilePath))

    annotateVerbs(resultDict, annotatedVerbFileName)
def constructor_graph(f):
    g = myGraph()
    actor_list = []
    actor_name = set()
    movie_list = []
    vertices = []
    edges = []
    items = {}
    total_edge = 0

    for item in json_lines.reader(f):
        # def add_vertex_to_graph(self, name, age, gross, date, page):
        if ('actor_name' in item):
            # print(item['actor_name'], item['actor_age'])
            actor_list.append(item)
            actor_name.add(item['actor_name'])
            g.add_vertex_to_graph(item['actor_name'], item['actor_age'], None,
                                  None, item['page'], False)
            actor_detail = "Actor: " + item['actor_name'] + "\nAge: " + str(item['actor_age'])
            items[item['actor_name']] = len(vertices)
            vertices.append(actor_detail)
        else:
            movie_list.append(item)
            g.add_vertex_to_graph(item['name'], None, item['gross'],
                                  item['date'], item['page'], True)
            movie_detail = "Movie: " + item['name'] + "\n Total Gross: " + str(item['gross'])
            items[item['name']] = len(vertices)
            vertices.append(movie_detail)

    for m in movie_list:
        gross = m['gross']
        movie_name = m['name']
        i = 1
        for actor in m['actors']:
            cur_name = actor[actor.rfind("/") + 1:].replace("_", " ")
            if cur_name not in actor_name:
                continue
            edge_weight = gross * (1 + i * 0.0001)
            i += 1
            g.add_edge(cur_name, movie_name, edge_weight)
            total_edge += 1
            edges.append((items[cur_name], items[movie_name]))

    global total_actor
    global total_movie
    total_actor = len(actor_list)
    total_movie = len(movie_list)
    print(total_edge)

    graph = Graph(vertex_attrs={"label": vertices}, edges=edges, directed=False)
    Graph.write_svg(graph, fname="graph_cache.svg", labels='label',
                    colors="blue", vertex_size=3,
                    edge_colors=["yellow"] * 1000, font_size="4")
    return g
import json_lines
import tensorflow as tf
import gensim
import numpy as np
import jsonlines
import pickle
from random import shuffle

# data_file = "./resources/stub.jsonl"
data_file = "./resources/all.jsonl"
data = json_lines.reader(open(data_file))

vocab = dict()
max_len = 0
max_ind = 0
record = None
count = 0
tot = 0

for sample in data:
    sentence1 = sample.get("sentence1").strip(".").split()
    sentence2 = sample.get("sentence2").strip(".").split()
    tot = tot + len(sentence1) + len(sentence2)
    if len(sentence1) > max_len:
        max_len = len(sentence1)
        record = sentence1
    if len(sentence2) > max_len:
        max_len = len(sentence2)
        record = sentence2
import json_lines
import gensim
import numpy as np

# Defining constants
node_number = 32
batch_size = 1
embedding_dim = 300
class_num = 3

# Defining file names
train_data_file_name = "./resources/snli_1.0_dev.jsonl"
f = open(train_data_file_name)
embedding_file_name = "./resources/temp.bin"
# word_dict = gensim.models.KeyedVectors.load_word2vec_format(embedding_file_name, binary=True)

file = json_lines.reader(f)
content = []
for line in file:
    content.append(line)


def read_data(lines: list):
    labels = []
    inputs = []
    for line in lines:
        sentence1 = []
        sentence2 = []
        label_text = line.get("gold_label")
        if label_text == '-':
            continue
# init db -> this means creating it from scratch
if os.path.exists(master_kb_text_dir):
    os.remove(master_kb_text_dir)

con = sq.connect(master_kb_text_dir)
query = 'CREATE TABLE master_kb_text (idx real, authors text, document_nm text, paragraph text, url text, verse_references text)'
con.execute(query)
con.commit()

# start process loop
vectorised_master_kb = []
k, idx = 0, 0
for jsonl_kbs in jsonl_kbs_to_process:
    with open(jsonl_kbs, 'rb') as jsonl_kb_file:
        for json_line in json_lines.reader(jsonl_kb_file):
            # log
            k += 1
            if k % 10 == 0:
                print(f"{datetime.datetime.now()} - {k}")

            if filter_jsonl(json_line):
            # if True:
                # (1) encode the context and save numpy array
                # (2) save the paragraph in sqllite
                # print("\n"+json_line['paragraph']+"\n")

                # 1. encode context
                vectorised_master_kb.append(
                    model.predict(json_line['paragraph'],
def __init__(self, jsonl_path, mode=None):
    self.mode = mode
    self.raw = []
    self.lst = []
    self.refs = []

    if mode == 'test':
        lst = json.load(open(jsonl_path, 'r'))
        for item in lst:
            context = item['context']
            dialog = []
            for utts in context:
                p = utts.find(':')
                dialog.append(
                    ((utts[p - 1] == 'A') * 2 - 1, utts[p + 2:-1], 0))
            if dialog[0][1][-1] == '>':
                dialog = dialog[1:]
            if len(dialog) == 0:
                continue

            responses = []
            for resp in item['responses']:
                # if resp[0] == ')':
                #     resp = resp[2:]
                responses.append(resp)

            spk = (item['speaker'] == 'A') * 2 - 1
            dialog.append((spk, responses[0], 0))
            responses = responses[1:]
            responses = [
                ' '.join(WordPunctTokenizer().tokenize(resp))
                for resp in responses
            ]
            if len(responses) == 0:
                continue

            self.raw.append(dialog)
            self.lst.append((len(self.raw) - 1, 0, len(dialog)))
            self.refs.append(responses)
        return

    from collections import Counter
    self.ct = Counter()
    self.topics = []
    with open(jsonl_path, 'r') as f:
        for idx, item in enumerate(reader(f)):
            utts = item['utts']
            self.topics.append(item['topic'])
            self.raw.append([(int(speaker == 'A') * 2 - 1, sentence, _)
                             for speaker, sentence, _ in utts])
            lst = [(idx, start, start + wnd_sz) for start in range(0, len(utts) - wnd_sz)] + \
                  [(idx, 0, end) for end in range(2, min(wnd_sz + 1, len(utts)))]
            # for i, start, end in lst:
            #     spk_lst = self.raw[idx][end-2][0]
            #     spk_tgt = self.raw[idx][end-1][0]
            #
            #     self.lst.append((i, start, end))
            self.lst += lst
    self.refs = [['none']] * len(self.lst)
def __init__(self, file_name):
    self.file = json_lines.reader(open(file_name))
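The one-line constructor above never closes the underlying file handle. A possible variant that keeps a reference so it can be closed explicitly; the class and attribute names are illustrative, not from the original project:

import json_lines

class JsonlFile:
    # Hypothetical variant of the constructor above that owns the handle.
    def __init__(self, file_name):
        self._fp = open(file_name, 'rb')
        self.file = json_lines.reader(self._fp)

    def close(self):
        self._fp.close()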
def retweetnetwork(filename,
                   giant_component=False,
                   privacy=False,
                   aggregation=None,
                   t=0,
                   starttime=None,
                   endtime=None):
    """Generate Retweet Network from Twitter data collection.

    Parameters:
        filename: path to jsonl twitter object to transform
        giant_component (boolean): keep only largest weakly connected component
        aggregation (str): aggregation method to use ('soft', 'hard', None)
        privacy:
        t (int): threshold for hard aggregation

    Returns:
        igraph graph object: retweet network where a link is created
        from i to j if i retweeted j.
    """
    with open(filename, 'rb') as f:
        nodesdict = {}
        edgelist = []
        d3graph = {"nodes": [], "links": []}

        for tweet in json_lines.reader(f):
            if 'retweeted_status' in tweet:
                time = tweet["created_at"]
                time = datetime.strptime(time, '%a %b %d %X %z %Y')
                time_date = time.date()
                if starttime <= time_date <= endtime:

                    # retweeting node [source of retweet action]
                    name = tweet["user"]["screen_name"]
                    try:
                        nodesdict[f"{name}"]["followers"] = tweet["user"]["followers_count"]
                        nodesdict[f"{name}"]["friends"] = tweet["user"]["friends_count"]
                    except KeyError:
                        nodesdict[f"{name}"] = {}
                        nodesdict[f"{name}"]["followers"] = tweet["user"]["followers_count"]
                        nodesdict[f"{name}"]["friends"] = tweet["user"]["friends_count"]
                    try:
                        nodesdict[f"{name}"]["tweets"].append(tweet["id_str"])
                    except KeyError:
                        nodesdict[f"{name}"]["tweets"] = []
                        nodesdict[f"{name}"]["tweets"].append(tweet["id_str"])

                    # retweeted node [target of retweet action]
                    name = tweet['retweeted_status']["user"]["screen_name"]
                    try:
                        nodesdict[f"{name}"]["followers"] = tweet['retweeted_status']["user"]["followers_count"]
                        nodesdict[f"{name}"]["friends"] = tweet['retweeted_status']["user"]["friends_count"]
                    except KeyError:
                        nodesdict[f"{name}"] = {}
                        nodesdict[f"{name}"]["followers"] = tweet['retweeted_status']["user"]["followers_count"]
                        nodesdict[f"{name}"]["friends"] = tweet['retweeted_status']["user"]["friends_count"]
                    try:
                        nodesdict[f"{name}"]["tweets"].append(tweet['retweeted_status']["id_str"])
                    except KeyError:
                        nodesdict[f"{name}"]["tweets"] = []
                        nodesdict[f"{name}"]["tweets"].append(tweet['retweeted_status']["id_str"])

                    # links
                    source = tweet["user"]["screen_name"]
                    target = tweet['retweeted_status']['user']['screen_name']
                    tweetid = tweet["id_str"]
                    time_str = time.isoformat(timespec='seconds')
                    edgelist.append((source, target, tweetid, time_str))

    # print("Importing to igraph...")
    # import to igraph
    G = ig.Graph.DictList(
        edges=(dict(source=source, target=target, tweet=tweetid, time=time, weight=1)
               for source, target, tweetid, time in edgelist),
        vertices=None,
        directed=True)

    # add node metadata
    for v in G.vs:
        name = v['name']
        v['followers'] = nodesdict[name]['followers']
        v['friends'] = nodesdict[name]['friends']
        v['tweets'] = list(set(nodesdict[name]['tweets']))

    # print("Running giant component and aggregations...")
    # giant_component == False and aggregation == None
    if giant_component == False and aggregation == None:
        pass

    # giant_component == True and aggregation == None
    elif giant_component == True and aggregation == None:
        G = G.components(mode="weak").giant()

    # giant_component == False and aggregation == 'hard'
    elif giant_component == False and aggregation == 'hard':
        todel = []
        for v in G.vs:
            if G.degree(v, mode="in") <= t:
                todel.append(v.index)
        # print("Deleting vertices")
        G.delete_vertices(todel)
        # G = G.components(mode="weak").giant()

    # giant_component == True and aggregation == 'hard'
    elif giant_component == True and aggregation == 'hard':
        todel = []
        for v in G.vs:
            if G.degree(v, mode="in") <= t:
                todel.append(v.index)
        # print("Deleting vertices")
        G.delete_vertices(todel)
        G = G.components(mode="weak").giant()

    # giant_component == False and aggregation == 'soft'
    elif giant_component == False and aggregation == 'soft':
        # G = G.components(mode="weak").giant()
        todel = []
        for v in G.vs:
            if G.degree(v, mode="in") == 0 and len(set(G.neighbors(v, mode="out"))) < 2:
                todel.append(v.index)
        # print("Deleting vertices")
        G.delete_vertices(todel)

    # giant_component == True and aggregation == 'soft'
    elif giant_component == True and aggregation == 'soft':
        G = G.components(mode="weak").giant()
        todel = []
        for v in G.vs:
            if G.degree(v, mode="in") == 0 and len(set(G.neighbors(v, mode="out"))) < 2:
                todel.append(v.index)
        G.delete_vertices(todel)

    return G
def preprocess():
    ontology = {
        'domains': {},
        'intents': {},
        'binary_dialogue_act': [],
        'state': {}
    }

    def process_dialog(ori_dialog, split, dialog_id):
        domain = ori_dialog['domain']
        ontology['domains'][domain] = {
            'description': "",
            'slots': {}
        }
        dialog = {
            "dataset": dataset,
            "data_split": split,
            "dialogue_id": f'{dataset}_{dialog_id}',
            "original_id": ori_dialog['id'],
            "domains": [domain],
        }
        turns = []
        # starts with system
        for utt_idx, utt in enumerate(ori_dialog['turns'][1:]):
            turn = {
                'utt_idx': utt_idx,
                'utterance': utt,
                'dialogue_act': {
                    'categorical': [],
                    'non-categorical': [],
                    'binary': [],
                },
            }
            if utt_idx % 2 == 0:
                turn['speaker'] = 'user'
                turn['state'] = {}
                turn['state_update'] = {
                    'categorical': [],
                    'non-categorical': [],
                }
            else:
                turn['speaker'] = 'system'
            turns.append(turn)
        if turns[-1]['speaker'] == 'system':
            turns.pop()
        dialog['turns'] = turns
        return dialog

    dialog_id = 0
    data = []
    with ZipFile(os.path.join(origin_data_dir, 'metalwoz-v1.zip')) as zipfile:
        for path in zipfile.namelist():
            if path.startswith('dialogues'):
                for dialog in json_lines.reader(zipfile.open(path)):
                    data.append(process_dialog(dialog, 'train', dialog_id))
                    dialog_id += 1

    ZipFile(os.path.join(origin_data_dir, 'metalwoz-test-v1.zip')).extract('dstc8_metalwoz_heldout.zip')
    with ZipFile(os.path.join('dstc8_metalwoz_heldout.zip')) as zipfile:
        for path in zipfile.namelist():
            if path.startswith('dialogues'):
                for dialog in json_lines.reader(zipfile.open(path)):
                    data.append(process_dialog(dialog, 'test', dialog_id))
                    dialog_id += 1
    os.remove('dstc8_metalwoz_heldout.zip')

    json.dump(ontology, open(os.path.join(self_dir, 'ontology.json'), 'w'))
    json.dump(data, open('data.json', 'w'), indent=4)
    ZipFile(os.path.join(self_dir, 'data.zip'), 'w', ZIP_DEFLATED).write('data.json')
    os.remove('data.json')
exit(0)

output_file = str(sys.argv[1]) + '.jsonl'
print('\n')  # start from next line

with jsonlines.open(output_file, mode='w') as writer:
    print(parameters.file_open_msg.format(output_file))
    print('\n')  # start from next line

    for file_name in list_files:
        print(parameters.add_content_msg.format(file_name))

        total_count = 0
        with open(file_name, 'rb') as file:
            for item in json_lines.reader(file):
                total_count = total_count + 1
        print(parameters.file_len_msg.format(file_name, total_count))

        progress = 0
        with jsonlines.open(file_name) as reader:
            for item in reader:
                writer.write(item)
                progress = progress + 1
                print(parameters.file_read_prog.format(progress, total_count), end='\r')
            reader.close()
        print('\n')  # start from next line
def get_qa_info(difficulty, subset, special='', limit=0):
    limit_bool = True if np.bool(limit) else False

    if getpass.getuser() == 'Mitch':
        header = r'C:\Users\Mitch\PycharmProjects\ARC'
    else:
        header = '/home/kinne174/private/PythonProjects/ARC'

    if special == 'MOON':
        if getpass.getuser() == 'Mitch':
            MOON_filename = os.path.join(header, r'visualization\moon_questions.json')
        else:
            MOON_filename = '/home/kinne174/private/PythonProjects/JSM_2019/moon/moon_questions.json'

        MOON_allinfo = []
        MOON_document = namedtuple(
            'MOON_document', 'id question choices_text choices_labels answer')

        with open(MOON_filename, 'rb') as f:
            for dic in json_lines.reader(f):  # only one dict for some reason
                for id, item in dic.items():
                    question = item['question']
                    choices_text = item['choices_text']
                    choices_labels = item['choices_labels']
                    answer = item['answer']

                    MOON_allinfo.append(
                        MOON_document(id, question, choices_text, choices_labels, answer))

        return MOON_allinfo

    if difficulty == 'EASY':
        if subset == 'TRAIN':
            EASY_TRAIN_filename = r'ARC-V1-Feb2018-2\ARC-Easy\ARC-Easy-Train.jsonl'
            EASY_TRAIN_allinfo = []
            EASY_TRAIN_document = namedtuple(
                'EASY_TRAIN_document', 'id question choices_text choices_labels answer')

            with open(os.path.join(header, EASY_TRAIN_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [choice['text'] for choice in item['question']['choices']]
                    choices_labels = [choice['label'] for choice in item['question']['choices']]
                    answer = item['answerKey']

                    EASY_TRAIN_allinfo.append(
                        EASY_TRAIN_document(id, question, choices_text, choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break

            return EASY_TRAIN_allinfo

        elif subset == 'DEV':
            EASY_DEV_filename = r'ARC-V1-Feb2018-2\ARC-Easy\ARC-Easy-Dev.jsonl'
            EASY_DEV_allinfo = []
            EASY_DEV_document = namedtuple(
                'EASY_DEV_document', 'id question choices_text choices_labels answer')

            with open(os.path.join(header, EASY_DEV_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [choice['text'] for choice in item['question']['choices']]
                    choices_labels = [choice['label'] for choice in item['question']['choices']]
                    answer = item['answerKey']

                    EASY_DEV_allinfo.append(
                        EASY_DEV_document(id, question, choices_text, choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break

            return EASY_DEV_allinfo

        else:
            EASY_TEST_filename = r'ARC-V1-Feb2018-2\ARC-Easy\ARC-Easy-Test.jsonl'
            EASY_TEST_allinfo = []
            EASY_TEST_document = namedtuple(
                'EASY_TEST_allinfo', 'id question choices_text choices_labels answer')

            with open(os.path.join(header, EASY_TEST_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [choice['text'] for choice in item['question']['choices']]
                    choices_labels = [choice['label'] for choice in item['question']['choices']]
                    answer = item['answerKey']

                    EASY_TEST_allinfo.append(
                        EASY_TEST_document(id, question, choices_text, choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break

            return EASY_TEST_allinfo

    else:
        if subset == 'TRAIN':
            CHALLENGE_TRAIN_filename = r'ARC-V1-Feb2018-2\ARC-Challenge\ARC-Challenge-Train.jsonl'
            CHALLENGE_TRAIN_allinfo = []
            CHALLENGE_TRAIN_document = namedtuple(
                'CHALLENGE_TRAIN_allinfo', 'id question choices_text choices_labels answer')

            with open(os.path.join(header, CHALLENGE_TRAIN_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [choice['text'] for choice in item['question']['choices']]
                    choices_labels = [choice['label'] for choice in item['question']['choices']]
                    answer = item['answerKey']

                    CHALLENGE_TRAIN_allinfo.append(
                        CHALLENGE_TRAIN_document(id, question, choices_text, choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break

            return CHALLENGE_TRAIN_allinfo

        elif subset == 'DEV':
            CHALLENGE_DEV_filename = r'ARC-V1-Feb2018-2\ARC-Challenge\ARC-Challenge-Dev.jsonl'
            CHALLENGE_DEV_allinfo = []
            CHALLENGE_DEV_document = namedtuple(
                'CHALLENGE_DEV_allinfo', 'id question choices_text choices_labels answer')

            with open(os.path.join(header, CHALLENGE_DEV_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [choice['text'] for choice in item['question']['choices']]
                    choices_labels = [choice['label'] for choice in item['question']['choices']]
                    answer = item['answerKey']

                    CHALLENGE_DEV_allinfo.append(
                        CHALLENGE_DEV_document(id, question, choices_text, choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break

            return CHALLENGE_DEV_allinfo

        else:
            CHALLENGE_TEST_filename = r'ARC-V1-Feb2018-2\ARC-Challenge\ARC-Challenge-Test.jsonl'
            CHALLENGE_TEST_allinfo = []
            CHALLENGE_TEST_document = namedtuple(
                'CHALLENGE_TEST_allinfo', 'id question choices_text choices_labels answer')

            with open(os.path.join(header, CHALLENGE_TEST_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [choice['text'] for choice in item['question']['choices']]
                    choices_labels = [choice['label'] for choice in item['question']['choices']]
                    answer = item['answerKey']

                    CHALLENGE_TEST_allinfo.append(
                        CHALLENGE_TEST_document(id, question, choices_text, choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break

            return CHALLENGE_TEST_allinfo
import json_lines
import time

with open('nguoiduatin_phapluat.json', 'rb') as f:
    for item in json_lines.reader(f):
        print(item['content'])
        time.sleep(5)
def read_jsonl_file(jsonl_fn):
    with open(jsonl_fn, 'rb') as f:
        for item in json_lines.reader(f):
            yield item
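A brief usage sketch for the generator above; 'data.jsonl' is an assumed filename, not from the original project.

# Iterate lazily over the records; nothing is held in memory beyond one item.
for record in read_jsonl_file('data.jsonl'):
    print(record)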
import pandas as pd
import json_lines as jl
from sys import argv
import logging

if len(argv) < 3:
    logging.critical(
        'Not enough parameters passed. Run script as:\npython postprocess.py [input-file] [output-directory]'
    )
    exit(1)

parcel_items = []
building_items = []

with open('../scraper/parceldata.jl', 'rb') as f:
    for item in jl.reader(f):
        if not item:
            # pass over empty dictionary
            continue
        elif 'property_address' in item:
            pass
        elif 'use_code' in item:
            pass
        elif 'millage_rate' in item:
            pass
        elif 'owner' in item:
            pass
temp_word = f.read()
temp_word = re.sub('##', '', temp_word)
print(temp_word)
vocab = temp_word.split()
# print(vocab)

# load the NER tagger
tagger = SequenceTagger.load('ner')

path = r'C:\Users\Luca\Desktop\Current Projects\DMT HW 3\DataSet'
new_data = []

with open(path + '\\paper_dev.jsonl', 'rb') as f:  # opening file in binary (rb) mode
    for index, item in enumerate(json_lines.reader(f)):
        # discard all items with label 'NOT ENOUGH INFO'
        if item['label'] == 'NOT ENOUGH INFO':
            continue
        print('Index--->', index)
        print()

        # We are not interested in the evidence key
        del item['evidence']

        # run NER over sentence
        sentence = Sentence(copy.deepcopy(item['claim']))
        tagger.predict(sentence)

        # We are only interested in single labels for the whole claim
        sentence = sentence.to_dict(tag_type='ner')
        sentence['labels']