def persistence_load(db_path=config.WN_FEATURE_CACHE_PATH):
    p_dict = {
        'hypernym_stems_dict': dict(),
        'hyponym_stems_dict': dict(),
        'hyper_lvl_dict': dict(),
        'hypo_lvl_dict': dict(),
        'ant_dict': dict(),
        'em_lemmas_dict': dict(),
    }
    # if em_dict:
    #     p_dict['em_dict'] = dict()
    for dict_name in p_dict.keys():
        print("Loading Persistent WN Feature Dict:", dict_name)
        in_db_dict = SqliteDict(str(db_path / dict_name), autocommit=False,
                                tablename='the_table', flag='c')
        for key, v in tqdm(in_db_dict.items()):
            p_dict[dict_name][key] = v
        in_db_dict.close()
    return p_dict
def distribution_from_data_db(data_db: SqliteDict, vec_dim: int):
    sums = np.zeros(vec_dim, dtype=np.float32)
    squares = np.zeros(vec_dim, dtype=np.float32)
    for _, vec in data_db.items():
        sums += vec
    means = sums / len(data_db)
    for _, vec in data_db.items():
        squares += (vec - means) ** 2
    # square root of the variance, i.e. the per-dimension standard deviation
    std_devs = np.sqrt(squares / len(data_db))
    weights = list(zip(means, std_devs))
    return weights
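# Usage sketch for distribution_from_data_db (hypothetical file name; assumes the
# SqliteDict stores fixed-length float32 numpy vectors, as the function expects).
feature_db = SqliteDict('features.sqlite', flag='r')  # hypothetical path
per_dim_stats = distribution_from_data_db(feature_db, vec_dim=300)
mean_0, std_0 = per_dim_stats[0]  # (mean, std-dev) of the first dimension
feature_db.close()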
def sqlite_to_array(num):
    p2 = SqliteDict(f"{args.src}perspective_value{num}.sqlite",
                    tablename="value", flag="r")
    c = 0
    t_ini = time()
    ids = []
    perspective = []
    for key, value in p2.items():
        if c % 100000 == 0:
            print("iteration number ", c, "at",
                  round((time() - t_ini) / 60, 2), "minutes")
            c += 1
        if c % 5000000 == 0:
            save_arrays(num, perspective, ids, c)
            ids = []
            perspective = []
        ids.append(key)
        perspective.append(tuple(value.values()))
        c += 1
    save_arrays(num, perspective, ids, c)
class CDataBase(object):
    def __init__(self):
        try:
            self.close()
        except Exception:
            pass
        self.mydict = SqliteDict('./DB/my_db.sqlite', autocommit=True)
        self.show()

    def set(self, key, value):
        self.mydict[key] = value

    def get(self, key):
        if key in self.mydict.keys():
            ret = self.mydict[key]
        else:
            ret = None
        return ret

    def show(self, start_with=''):
        for key, value in self.mydict.items():
            # str.find() returns 0 when the key starts with the prefix
            if key.find(start_with) == 0:
                print(key, '\t', value, '\n')

    def clear(self):
        self.mydict.clear()

    def close(self):
        self.mydict.close()
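# Usage sketch for CDataBase (assumes the ./DB/ directory already exists, since
# SqliteDict will create my_db.sqlite inside it).
db = CDataBase()
db.set('greeting', 'hello')
print(db.get('greeting'))  # -> 'hello'
print(db.get('missing'))   # -> None
db.close()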
def check_for_cached_result(query: str, thresh: float = 0.8, sim=cosine_sim) -> tuple:
    query_map = SqliteDict(query_map_path)
    query_results = SqliteDict(query_db_path)
    # see if we have computed this exact query before
    try:
        result = query_results[query]
        query_vec = query_map[query]
        query_map.close()
        query_results.close()
        return result, query_vec
    except KeyError:
        pass
    # If not, see if any query is close enough
    processed_query = process_query(query)
    max_sim_query, score = get_max_sim(processed_query, query_map.items(), sim=sim)
    if score > thresh:
        result = query_results[max_sim_query]
        query_map.close()
        query_results.close()
        # return a (result, vector) pair here as well so every path yields a tuple
        return result, processed_query
    query_map.close()
    query_results.close()
    return [], processed_query
def process_db(dbpath, options):
    output_count = 0
    # No context manager (and no close()) as this is read-only and
    # close() can block for a long time for no apparent reason.
    db = SqliteDict(dbpath, flag='r', autocommit=False)
    for key, value in db.items():
        root, ext = os.path.splitext(key)
        if ext != options.suffix:
            continue
        if options.random is not None and options.random < random():
            continue
        if options.id_prefix is None:
            doc_id = root
        else:
            doc_id = options.id_prefix + root
        text = value.rstrip('\n').replace('\n', ' ').replace('\t', ' ')
        print('{}\t<AUTHORS>\t<JOURNAL>\t<YEAR>\t{}'.format(doc_id, text))
        output_count += 1
        if options.limit is not None and output_count >= options.limit:
            break
    return output_count
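# Usage sketch for process_db (hypothetical db path; the options object is built
# here as a plain namespace carrying only the fields the function reads).
from argparse import Namespace
opts = Namespace(suffix='.txt', random=None, id_prefix=None, limit=10)
n = process_db('documents.sqlite', opts)  # hypothetical db file
print(n, 'documents written')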
class TagsData(object):
    _tags_db_path = './src/server/tags.sqlite'

    def __init__(self, db_path=_tags_db_path):
        self._tags_db = SqliteDict(db_path, tablename='tags', autocommit=True)
        self._metadata = self.compute_metadata()

    def compute_metadata(self):
        res = []
        for tag_user, data in self._tags_db.items():
            tag, user = tag_user.split(':')
            num_tagged = len(data['pos']) + len(data['neg']) + len(data['unk'])
            res.append({'tag': tag, 'num': num_tagged, 'user': user})
        res.sort(key=lambda item: item['num'], reverse=True)
        return res

    def get_metadata(self):
        return self._metadata

    def get_data(self, tag, user):
        tag_user = tag + ':' + user
        if tag_user in self._tags_db:
            return self._tags_db[tag_user]
        return {'pos': [], 'neg': [], 'unk': []}

    def save_data(self, tag, user, pos, neg, unk):
        tag_user = tag + ':' + user
        self._tags_db[tag_user] = {'pos': pos, 'neg': neg, 'unk': unk}
        # recompute metadata.
        # This could be done more efficiently with an incremental update
        self._metadata = self.compute_metadata()
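# Usage sketch for TagsData (hypothetical db path; output assumes an empty db).
tags = TagsData(db_path='./src/server/tags.sqlite')
tags.save_data('cat', 'alice', pos=['img1'], neg=[], unk=['img2'])
print(tags.get_data('cat', 'alice')['pos'])  # -> ['img1']
print(tags.get_metadata()[0])                # -> {'tag': 'cat', 'num': 2, 'user': 'alice'}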
def check(self, unique_pos_in='data/pos_unique.tab',
          unique_pos_out='data/pos_unique.db',
          unique_ner_in='data/ner_unique.tab',
          unique_ner_out='data/ner_unique.db'):
    """Init constants from POS and NER data.

    Keyword arguments: pos_in/pos_out and ner_in/ner_out.
    pos_in line structure is [tag \t description], e.g.:
        NOUN	noun, singular or mass
    ner_in line structure is [type \t description], e.g.:
        ORG	Companies, agencies, institutions, etc.

    Raises:
        Exception: check your pos data path or check your ner data path
    """
    if not os.path.exists(unique_pos_out):
        if not os.path.exists(unique_pos_in):
            raise Exception(
                'check your pos unique path ({})'.format(unique_pos_in))
        else:
            unique_pos = pd.read_csv(unique_pos_in, sep='\t')
            with SqliteDict(unique_pos_out, autocommit=True) as db:
                for p in unique_pos.iterrows():
                    db[p[1][0]] = p[1][1]
                self.POS = list(db.items())
    else:
        db = SqliteDict(unique_pos_out)
        self.POS = list(db.items())
        db.close()

    if not os.path.exists(unique_ner_out):
        if not os.path.exists(unique_ner_in):
            raise Exception(
                'check your ner unique path ({})'.format(unique_ner_in))
        else:
            unique_ner = pd.read_csv(unique_ner_in, sep='\t')
            with SqliteDict(unique_ner_out, autocommit=True) as db:
                for p in unique_ner.iterrows():
                    db[p[1][0]] = p[1][1]
                self.NER = list(db.items())
    else:
        db = SqliteDict(unique_ner_out)
        self.NER = list(db.items())
        db.close()
def next_batch_from_dataset(data_db: SqliteDict, batch_size=20000):
    qkeys = []
    feature_vectors = []
    for q, vec in data_db.items():
        qkeys.append(q)
        feature_vectors.append(vec)
        if len(qkeys) == batch_size:
            yield qkeys, feature_vectors
            qkeys = []
            feature_vectors = []
    if len(qkeys) > 0:
        yield qkeys, feature_vectors
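# Usage sketch for next_batch_from_dataset (hypothetical file name; assumes the
# DB maps query keys to feature vectors).
data_db = SqliteDict('features.sqlite', flag='r')  # hypothetical path
for qkeys, feature_vectors in next_batch_from_dataset(data_db, batch_size=1000):
    print(len(qkeys), 'keys in this batch')  # the final batch may be smaller
data_db.close()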
def gen_doc(self, is_pos=True):
    if is_pos:
        path = 'ui/public/pos.tab'
        db = SqliteDict(self.path_pos)
        pos = pd.DataFrame.from_dict(db.items())
        db.close()
        pos.to_csv(path, sep='\t', index=None, encoding='utf8', header=None)
        return 'generate pos file in : {}'.format(path)
    else:
        path = 'ui/public/ner.tab'
        db = SqliteDict(self.path_ner)
        ner = pd.DataFrame.from_dict(db.items())
        db.close()
        ner.to_csv(path, sep='\t', index=None, encoding='utf8', header=None)
        return 'generate ner file in : {}'.format(path)
def get_nearest(query_vec: BagOfWordsVector, k: int = 20, thresh: int = 0,
                sim=cosine_sim, return_all: bool = False) -> list:
    # Generate tuple list with entries in the form of (<doc_id>, <doc_vector>)
    db = SqliteDict(doc_vecs_db_path)
    doc_pairs = [(key, value.vector) for key, value in db.items()]
    db.close()
    if thresh != 0:
        results = search_by_threshold(query_vec, doc_pairs, thresh, sim=sim)
    else:
        results = search_by_knn(query_vec, doc_pairs, k, sim=sim,
                                return_all=return_all)
    return results
def sqlite_to_array(num):
    emp_sql = SqliteDict(f"{args.src}empath_value{num}.sqlite",
                         tablename="value", flag="r")
    t_ini = time()
    ids = []
    emp_values = []
    c = 0
    for key, value in emp_sql.items():
        if c % 1000000 == 0:
            print("iteration number ", c, "at",
                  round((time() - t_ini) / 60, 2), "minutes")
        c += 1
        ids.append(key)
        emp_values.append(tuple(value.values()))
        if c % 10000000 == 0:
            # pass num here as well so the call matches the final save_arrays call
            save_arrays(num, emp_values, ids, c)
            ids = []
            emp_values = []
    save_arrays(num, emp_values, ids, c)
def main():
    args = docopt(__doc__, help=True)
    input_file = args['<unsplash_sqlite_file>']
    output_file = args['<output_sqlite_file>']
    input_data = SqliteDict(input_file, tablename='images', flag='r')
    tag_data = defaultdict(list)
    for img, img_data in input_data.items():
        if 'tags' in img_data and 'urls' in img_data:
            for tag in img_data['tags']:
                tag_data[tag] += [img]
    with SqliteDict(output_file, tablename='tags', autocommit=True) as output_data:
        # build the mapping first, then store it under a single key so it is
        # actually written to the sqlite file
        unsplash = {}
        for tag in tag_data:
            img_list = tag_data[tag]
            if len(img_list) >= MIN_IMAGE_THRESHOLD:
                unsplash[tag] = {
                    'pos': img_list,
                    'neg': [],
                    'unk': [],
                }
        output_data['unsplash'] = unsplash
            tokens.remove('')
    tokens = list(tokens)
    return tokens


X_test = []
y_test = []

''' Build testing dataset '''
print('Building testing dataset')
ctr_breaker = 0
start_time = time.time()
for claimId, val in testing_db.items():
    supportsOrRefutes = val[1]
    if ctr_breaker % 500 == 0:
        print(ctr_breaker)
    if ctr_breaker == 30000:
        break
    ctr_breaker += 1
    if supportsOrRefutes != 'NOT ENOUGH INFO':
        claim = val[2]
        claimTokens = tokenise_line(claim)
from sqlitedict import SqliteDict
import numpy as np

linksDB = SqliteDict('phase2-links.sqlite', autocommit=True)
metadataDB = SqliteDict('phase2-metadata.sqlite', autocommit=True)

d = 0.1
pageRanks = {}
pageRanksTemp = {}
for page, metadata in metadataDB.items():
    pageRanks[page] = 1
    pageRanksTemp[page] = 1

numberIterations = 3
for iteration in range(numberIterations):
    for page, metadata in metadataDB.items():
        score = 0
        # use a distinct loop variable so the outer metadata is not shadowed
        for childPage, child_metadata in metadataDB.items():
            if int(page) in linksDB[childPage]['out']:
                score += pageRanks[childPage] / len(linksDB[childPage]['out'])
        pageRanksTemp[page] = d + (1 - d) * score
    # copy, so the next iteration reads the previous scores rather than an alias
    pageRanks = dict(pageRanksTemp)

for pageID, rank in pageRanks.items():
    data = metadataDB[pageID]
    index = len(data)
    # np.insert returns a new array; store it back so the rank is persisted
    metadataDB[pageID] = np.insert(data, index, rank)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dnscache', default="dnscache.sqld",
                        help='IP address cache default: %(default)s')
    parser.add_argument(
        '--download', default="pages.sqld",
        help='Here is where the downloaded pages go: %(default)s')
    parser.add_argument(
        '--r404', default="404.sqld",
        help='Here is where we remember pages that gave 404 etc: %(default)s')
    args = parser.parse_args()

    # 2) Results setup
    result_store = SqliteDict(args.download, encode=gz_encode,
                              decode=gz_decode, autocommit=True)
    for url, cont in result_store.items():
        print(url, cont[:30])

    # 3) 404 setup
    r404 = SqliteDict(args.r404, autocommit=True)
    for url, status in r404.items():
        print(url, status)
from sqlitedict import SqliteDict
from datetime import datetime
import pickle

source = "./../data/sqlite/community_texts/"
path = "./../data/sentiment/community_id/"

names_list = ["right-center", "Alt-right", "center", "right", "left",
              "left-center", "IDW", "Alt-lite"]

for name in names_list:
    community = SqliteDict(f"{source}{name}.sqlite", tablename="value", flag="r")
    ks = {}
    c = 0
    for key, value in community.items():
        if c % 1000000 == 0:
            print("Iteration number", c)
        c += 1
        ks[key] = datetime.fromtimestamp(value["timestamp"] // 1000).year
    with open(f'{path}{name}.pickle', 'wb') as handle:
        pickle.dump(ks, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("pickle")
class BackendDbHandler(object):
    """Table structure

    target_pages: A table to save URL where folklore is.
        Key-value pair. {url_string: TargetPage object}
    target_html: A table to save HTML of folklore.
        Key-value pair. {url_string: ExtractedPage object}
    """

    def __init__(self, path_db_file: str, interval: int = 3):
        self.db_target_pages = SqliteDict(path_db_file, autocommit=True,
                                          tablename='target_pages',
                                          encode=json.dumps, decode=json.loads)
        self.db_html = SqliteDict(path_db_file, autocommit=True,
                                  tablename='target_html',
                                  encode=json.dumps, decode=json.loads)
        self.interval = interval

    def save_target_urls(self, target_urls: List[str]):
        """Save target URL into DB."""
        for url in target_urls:
            if url not in self.db_target_pages:
                data, errs = TargetPage(strict=True).load({
                    'page_url': url,
                    'status': False,
                    'note': '',
                    'extracted_at': ''
                })
                self.db_target_pages[url] = data
            else:
                logger.info('URL={} is already in target. Skip.'.format(url))
        else:
            self.db_target_pages.commit()

    def run_html_extraction(self, is_force_retry: bool = False, limit: int = -1):
        """Gets all target pages and saves them into the DB."""
        default_i = 0
        for url, page_obj in tqdm(list(self.db_target_pages.items())):
            _obj = TargetPage(strict=True).load(page_obj)
            if page_obj['status'] is False or is_force_retry is True:
                try:
                    html_doc = requests.get(url).text
                    error_msg = ''
                    status = True
                # catch request failures; ExtractedPage is a schema, not an exception
                except requests.exceptions.RequestException as e:
                    html_doc = ''
                    error_msg = e.__str__()
                    status = False
                data, errs = ExtractedPage(strict=True).load({
                    'page_url': url,
                    'status': status,
                    'html_document': html_doc,
                    'note': error_msg,
                    'extracted_at': datetime.now().__str__()
                })
                page_obj['status'] = True
                page_obj['extracted_at'] = datetime.now().__str__()
                self.db_target_pages[url] = data
                default_i += 1
                time.sleep(self.interval)
                if default_i == limit:
                    logger.info('Terminated by limit={}'.format(limit))
                    break
            else:
                logger.info('URL={} is already in target. Skip.'.format(url))
        else:
            self.db_target_pages.commit()
            self.db_html.commit()

    def show_extracted_html(self) -> List[Dict[str, Any]]:
        __ = []
        for url, obj_ in self.db_target_pages.items():
            data, errs = ExtractedPage(strict=True).load(obj_)
            if data['status']:
                __.append(obj_)
        else:
            return __
# Creates a sqlite for each category
from sqlitedict import SqliteDict

splits = [i * 10000000 for i in range(0, 8)]
source = './../data/sqlite/split_texts/'
path = "./../data/sqlite/community_texts/"

actual_category = "none"
category_dict = SqliteDict(f"{path}AL.sqlite", tablename="value", journal_mode="OFF")
text_dict = SqliteDict(f"{source}text_dict_{0}.sqlite", tablename="value", flag="r")

c = 0
for num in splits:
    category_dict.commit()
    text_dict.close()
    text_dict = SqliteDict(f"{source}text_dict_{num}.sqlite", tablename="value", flag="r")
    print(num)
    for id_c, value in text_dict.items():
        if value["category"] != actual_category:
            category_dict.commit()
            category_dict.close()
            category_dict = SqliteDict(f"{path}{value['category']}.sqlite",
                                       tablename="value", journal_mode="OFF")
            # remember the category just switched to, so the db is not
            # closed and reopened on every single record
            actual_category = value["category"]
        category_dict[id_c] = value

category_dict.commit()
category_dict.close()
    journal_mode='OFF')
page_title_token_positionsDB = SqliteDict(
    'phase2-page_title_token_positions.sqlite', journal_mode='OFF')
page_title_inverted_index = SqliteDict(
    'phase2-page_title_inverted_index.sqlite', journal_mode='OFF')

# cache heavily accessed dicts in main memory as python dicts
pageID2tf = {}
for key in metadataDB.keys():
    pageID2tf[int(key)] = np.uint16(metadataDB[key][5])
# invertedIndex = dict((int(k), v) for k, v in invertedIndex.items())
# page_title_inverted_index = dict((int(k), v) for k, v in page_title_inverted_index.items())

# invert mappings
id2token = dict((v, k) for k, v in token2id.items())
id2page = dict((v, k) for k, v in page2id.items())


@app.route('/')
def startpage():
    return render_template('startpage.html')


@app.route('/result', methods=['POST', 'GET'])
def result():
    if request.method == 'POST':
        result = request.form
        for key, value in result.items():
            query = value
    'v1alpha1', developerKey=api_key)

# Initiating the DataBases:
dict_c = SqliteDict(args.src, tablename="text", flag="r")
value_dict = SqliteDict(args.dst, tablename="value", journal_mode='OFF')

# Initiating multi-process pool:
workers = 500  # The number 20 was chosen because it best fit the # of requests/second
p = Pool(workers, initializer=initialize_worker)

time_iter = time()
print("bla", args.init, args.end)
to_request = [
    (k, v["text"], args.dst)
    for k, v in itertools.islice(dict_c.items(), args.init, args.end)
]
time_end = time()
dif = (args.end - args.init) // args.loop
print(f"Time to iter: {round((time_end - time_iter) / 60, 2)}")

for i in range(args.loop):
    time_init = time()
    p.starmap(process_text, to_request[i * dif:(i + 1) * dif])
    time_end = time()
    dt = time_end - time_init
    print(f"Time to run the {i} loop is {round(dt/60, 2)}")
    if i != args.loop - 1:
        sleep(100 - dt)

# Running Perspective
# add_perspective(to_request, dict_c, value_dict, p)
from sqlitedict import SqliteDict
import pickle
import itertools

t = []
ct = 0
c = 0

mydict = SqliteDict(
    "/../../../../../scratch/manoelribeiro/helpers/authors_dict.sqlite",
    tablename="authors", flag="r")

for _, value in mydict.items():
    if len(value) > 1:
        t.append(value)
        ct += 1
    if (c >= 11300000):
        break
    if (c % 100000) == 0:
        print(c, ct)
    c += 1

with open("authors_split_new2.pickle", "wb") as fp:
    pickle.dump(t, fp, protocol=pickle.HIGHEST_PROTOCOL)
def load_annotations(self, ann_file):
    self.split = self.img_prefix
    annotations, image_infos, simpsons_with_prefix = dict(), dict(), dict()
    # for prefix in ["OPEN_IMAGES", "COCO", "PASCAL", "VISUAL_GENOME"]:
    for prefix in ["VISUAL_GENOME"]:
        annotations_sqlite = SqliteDict(
            f"{ann_file}/{prefix}/annotations.sqlite")
        image_infos_sqlite = SqliteDict(
            f"{ann_file}/{prefix}/image_infos.sqlite")
        annos, infos, simpsons = dict(), dict(), dict()
        for i, anno in enumerate(tqdm.tqdm(annotations_sqlite.items())):
            annos[anno[0]] = anno[1]
            labels = [box["entity"] for box in anno[1].values()]
            simpson = simpson_di(labels)
            simpsons[anno[0]] = simpson
        annotations[prefix] = annos
        for i, info in enumerate(tqdm.tqdm(image_infos_sqlite.items())):
            if info[0] in annotations[prefix]:
                infos[info[0]] = info[1]
        image_infos[prefix] = infos
        simpsons_with_prefix[prefix] = simpsons

    entities, attributes, nats = list(), list(), list()
    for prefix, annotation in annotations.items():
        for image_id, annos in tqdm.tqdm(annotation.items()):
            for oid, entity_anno in annos.items():
                if entity_anno["entity"] not in [
                        "Q414241",  # part, 114
                        "Q395237",  # side, 244
                        "Q187456",  # bar, 277
                        "Q23444",  # white, 279
                        "Q9659",  # A, 310
                        "Q398475",  # This, 329
                        "Q55634432",  # surface, 337
                        "Q241124",  # Row, 451
                        "Q189171",  # Section, 478
                ]:
                    entities.append(entity_anno["entity"])
                    nats.append(entity_anno["natural_language"])
                    attributes += entity_anno["attributes"]
    entity_counter, attribute_counter, nat_counter = (
        Counter(entities),
        Counter(attributes),
        Counter(nats),
    )
    entities = entity_counter.most_common(1600)
    attributes = attribute_counter.most_common(400)
    # ipdb.set_trace()
    # nl2wb = SqliteDict(
    #     f"{ann_file}/OPEN_IMAGES/natural_language_to_wikibase_id.sqlite"
    # )
    # wb2nl = dict()
    # for k, v in nl2wb.items():
    #     if v not in wb2nl:
    #         wb2nl[v] = k
    # nl_entities = list()
    # for e in entities:
    #     nl_entities.append((wb2nl[e[0]], e[1]))
    # nl_attributes = list()
    # for e in attributes:
    #     nl_attributes.append((wb2nl[e[0]], e[1]))
    # json.dump(nl_entities, open(f"{ann_file}/nl_entities.json", "w"), indent=2)
    # json.dump(nl_attributes, open(f"{ann_file}/nl_attributes.json", "w"), indent=2)
    self.CLASSES = list([k for k, v in entities])
    self.ATTRIBUTES = list([k for k, v in attributes])
    self.cat_ids = self.CLASSES
    self.attr_ids = self.ATTRIBUTES
    self.cat2label = {
        cat_id: i + 1
        for i, cat_id in enumerate(self.cat_ids)
    }
    self.attr2label = {
        attr_id: i
        for i, attr_id in enumerate(self.attr_ids)
    }
    image_ids = list()
    for prefix, annotation in annotations.items():
        for key in tqdm.tqdm(annotation):
            annot = annotation[key]
            ents = list(annot.values())
            ents = [v["entity"] for v in ents]
            if any([(e in self.CLASSES) for e in ents]):
                image_ids.append((prefix, key))
    self.img_ids = [i[1] for i in image_ids]
    img_infos, self.annotations, self.simpsons = list(), dict(), dict()
    for prefix, i in tqdm.tqdm(image_ids):
        info = image_infos[prefix][i]
        info["id"] = f"{prefix}__{i}"
        if prefix == "OPEN_IMAGES":
            info["filename"] = f"OPEN_IMAGES/{info['split']}/{info['split']}/{i[:3]}/{i}.jpg"
        elif prefix == "PASCAL":
            info["filename"] = f"PASCAL/DATA/VOCdevkit/VOC2012/JPEGImages/{i}.jpg"
        elif prefix == "COCO":
            split = info["split"]
            info["filename"] = f"COCO/IMAGES_{split.upper()[:-4]}/{split}/{i}.jpg"
        elif prefix == "VISUAL_GENOME":
            info["filename"] = f"VISUAL_GENOME/IMAGE/{i}.jpg"
        else:
            raise NotImplementedError
        img_infos.append(info)
        self.annotations[info["id"]] = annotations[prefix][i]
        self.simpsons[info["id"]] = simpsons_with_prefix[prefix][i]
    self.img_prefix = f"{ann_file}/"
    if self.split == "train":
        img_infos = img_infos[:-1000]
    else:
        img_infos = img_infos[-1000:]
    return img_infos
from sqlitedict import SqliteDict
import datetime
import argparse
import json
import glob

dst_fldr = "/data/savvas/incels/data/"
dst = "channels_dict.sqlite"

tmp_sqlite = glob.glob(dst_fldr + "tmp/channels_dict*")

dict_db = SqliteDict(dst_fldr + dst, tablename="channels",
                     journal_mode="OFF", flag="w")

for tmp in tmp_sqlite:
    print(tmp)
    tmp_dict = SqliteDict(tmp, tablename="channels", journal_mode="OFF", flag="r")
    # print("start")
    for key, item in tmp_dict.items():
        # print(key)
        val = dict_db.get(key, [])
        val += item
        # print("1")
        dict_db[key] = val
        # print("2")
    # print("end")
    dict_db.commit()
    tmp_dict.close()
def csv_diff(csv_metadata_file, csv_metadata_file_compair, agent, output,
             indent, verbose, sqlite_dict):
    """Agencies record diff.

    :param csv_metadata_file: csv metadata file to compair.
    :param csv_metadata_file_compair: csv metadata file to compair too.
    :param agent: agent type to compair too.
    :param verbose: Verbose.
    :param sqlite_dict: SqliteDict Db file name.
    """
    def get_pid_data(line):
        """Get json from CSV text line.

        :param line: line of CSV text.
        :returns: data as json
        """
        data = json.loads(line.split('\t')[3].replace('\\\\', '\\'))
        pid = data.get('pid')
        return pid, data

    def ordert_data(data):
        """Order data pid, .

        :param line: line of CSV text.
        :returns: data as json
        """
        data = json.loads(data.split('\t')[3])
        pid = data.get('pid')
        return pid, data

    offset = '{character:{indent}}'.format(character=' ', indent=indent)

    def intent_output(data, intent):
        """Creates intented output.

        :param data: data to output.
        :param intent: intent to use.
        :returns: intented data.
        """
        output = ''
        lines = json.dumps(data, indent=indent).split('\n')
        for line in lines:
            output += '\n{offset}{line}'.format(offset=offset, line=line)
        return output

    if csv_metadata_file_compair and not agent:
        compair = csv_metadata_file_compair
    elif agent:
        compair = agent
    else:
        click.secho('One of -a or -d parameter mandatory', fg='red')
        sys.exit(1)
    click.secho('CSV diff: {first} <-> {second}'.format(
        first=compair, second=csv_metadata_file), fg='green')
    if output:
        file_name = os.path.splitext(csv_metadata_file)[0]
        file_name_new = '{name}_new.json'.format(name=file_name)
        file_new = open(file_name_new, 'w')
        file_new.write('[')
        file_name_diff = '{name}_changed.json'.format(name=file_name)
        file_diff = open(file_name_diff, 'w')
        file_diff.write('[')
        file_name_delete = '{name}_delete.json'.format(name=file_name)
        file_delete = open(file_name_delete, 'w')
        file_delete.write('[')
        click.echo('New file: {name}'.format(name=file_name_new))
        click.echo('Changed file: {name}'.format(name=file_name_diff))
        click.echo('Deleted file: {name}'.format(name=file_name_delete))

    compaire_data = SqliteDict(sqlite_dict, autocommit=True)
    if csv_metadata_file_compair and not agent:
        length = number_records_in_file(csv_metadata_file_compair, 'csv')
        with open(csv_metadata_file_compair, 'r', buffering=1) as meta_file:
            label = 'Loading: {name}'.format(name=compair)
            with click.progressbar(meta_file, length=length,
                                   label=label) as metadata:
                for metadata_line in metadata:
                    pid, data = get_pid_data(metadata_line)
                    compaire_data[pid] = data
    elif agent:
        agent_class = get_agent_class(agent)
        length = agent_class.count()
        ids = agent_class.get_all_ids()
        with click.progressbar(ids, length=length) as record_ids:
            for id in record_ids:
                record = agent_class.get_record_by_id(id)
                pid = record.pid
                compaire_data[pid] = record

    with open(csv_metadata_file, 'r', buffering=1) as metadata_file:
        for idx, metadata_line in enumerate(metadata_file):
            pid, data = get_pid_data(metadata_line)
            if pid in compaire_data:
                if compaire_data[pid] != data:
                    click.echo('DIFF: ')
                    click.echo(' old:\t{data}'.format(
                        data=json.dumps(compaire_data[pid], sort_keys=True)))
                    click.echo(' new:\t{data}'.format(
                        data=json.dumps(data, sort_keys=True)))
                    if output:
                        if idx > 0:
                            file_diff.write(',')
                        file_diff.write(intent_output(data, indent))
                del (compaire_data[pid])
            else:
                click.echo('NEW :\t{data}'.format(
                    data=json.dumps(data, sort_keys=True)))
                if output:
                    if idx > 0:
                        file_new.write(',')
                    file_new.write(intent_output(data, indent))

    idx = 0
    for pid, data in compaire_data.items():
        click.echo('DEL :\t{data}'.format(data=json.dumps(data, sort_keys=True)))
        if output:
            if idx > 0:
                file_delete.write(',')
            file_delete.write(intent_output(data, indent))
        idx += 1
    if output:
        file_new.write('\n]')
        file_new.close()
        file_diff.write('\n]')
        file_diff.close()
        file_delete.write('\n]')
        file_delete.close()
    sys.exit(0)
class SQLiteRepository(Repository):
    repository_name: str = 'sqlite'
    extension: str = '.db'

    def __init__(self, repository_path: str, commit_on_close: bool = True,
                 verbosity: int = 0):
        super().__init__(repository_path, commit_on_close=commit_on_close,
                         verbosity=verbosity)
        self.sqlite_repository = None
        self.table_name = None

    @contextmanager
    def connect(self, table_name: str) -> 'SQLiteRepository':
        yield self.open(table_name)
        self.close()

    def open(self, table_name: str):
        self.sqlite_repository = SqliteDict(
            self.repository_path, tablename=table_name,
            encode=json.dumps, decode=json.loads, flag='c')
        self.table_name = table_name
        return self

    def close(self):
        if self.sqlite_repository is not None:
            if self.commit_on_close:
                self.commit()
            self.sqlite_repository.close()
            self.sqlite_repository = None
            self.table_name = None

    def commit(self):
        self.sqlite_repository.commit()

    def keys(self) -> List[str]:
        return list(self.sqlite_repository.keys())

    def update(self, key: str, update_obj: dict):
        self.sqlite_repository[key] = update_obj

    def upsert(self, key: str, obj: dict):
        self.sqlite_repository[key] = obj

    def get(self, key: str) -> dict:
        try:
            return self.sqlite_repository[key]
        except KeyError:
            raise InvalidEntryError(key)

    def get_multiple(self, keys: List[str]) -> Dict[str, dict]:
        values = {key: element
                  for key, element in self.sqlite_repository.items()
                  if key in keys}
        if len(set(keys)) != len(values):
            invalids = set(keys).difference(values.keys())
            raise InvalidEntryError(', '.join(list(invalids)))
        return values

    def get_all(self) -> Dict[str, dict]:
        return {key: element for key, element in self.sqlite_repository.items()}

    def remove(self, key: str):
        try:
            del self.sqlite_repository[key]
        except KeyError:
            raise InvalidEntryError(key)

    def remove_multiple(self, keys: List[str]):
        for key in keys:
            self.remove(key)

    def clear(self):
        self.sqlite_repository.clear()

    def __repr__(self):
        return f"SQLiteRepository(file='{self.repository_path}', " \
               f"open={self.sqlite_repository is not None}, " \
               f"table='{self.table_name}')"
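# Usage sketch for SQLiteRepository.connect (hypothetical path and table name;
# assumes the Repository base class accepts this constructor signature).
repo = SQLiteRepository('/tmp/example.db', commit_on_close=True)
with repo.connect('documents') as store:
    store.upsert('doc-1', {'title': 'hello'})
    print(store.get('doc-1'))  # -> {'title': 'hello'}
    print(store.keys())        # -> ['doc-1']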
    # tokens = set(tokens)
    if '' in tokens:
        while '' in tokens:
            tokens.remove('')
    tokens = list(tokens)
    return tokens


claimLst = []
evidenceLst = []
targetLabelLst = []
ctr_breaker = 0
for claimId, val in training_db.items():
    supportsOrRefutes = val[1]
    if ctr_breaker % 500 == 0:
        print(ctr_breaker)
    if ctr_breaker == 150000:
        break
    ctr_breaker += 1
    bufferEvidenceCombination = []
    if supportsOrRefutes != 'NOT ENOUGH INFO':
        claim = val[2]
        claimTokens = tokenise_line(claim)
def csv_diff(csv_metadata_file, csv_metadata_file_compair, entity, output,
             indent, verbose, sqlite_dict):
    """Entities record diff.

    :param csv_metadata_file: CSV metadata file to compair.
    :param csv_metadata_file_compair: CSV metadata file to compair too.
    :param entity: entity type to compair too.
    :param verbose: Verbose.
    :param sqlite_dict: SqliteDict Db file name.
    """
    def get_pid_data(line):
        """Get JSON from CSV text line.

        :param line: line of CSV text.
        :returns: data as json
        """
        data = json.loads(line.split('\t')[3].replace('\\\\', '\\'))
        pid = data.get('pid')
        return pid, data

    def ordert_data(data):
        """Order data pid, .

        :param line: line of CSV text.
        :returns: data as json
        """
        data = json.loads(data.split('\t')[3])
        pid = data.get('pid')
        return pid, data

    if csv_metadata_file_compair and not entity:
        compair = csv_metadata_file_compair
    elif entity:
        compair = entity
    else:
        click.secho('One of -a or -d parameter mandatory', fg='red')
        sys.exit(1)
    click.secho(f'CSV diff: {compair} <-> {csv_metadata_file}', fg='green')
    if output:
        file_name = os.path.splitext(csv_metadata_file)[0]
        file_name_new = f'{file_name}_new.json'
        file_new = JsonWriter(file_name_new)
        file_name_diff = f'{file_name}_changed.json'
        file_diff = JsonWriter(file_name_diff)
        file_name_delete = f'{file_name}_delete.json'
        file_delete = JsonWriter(file_name_delete)
        click.echo(f'New file: {file_name_new}')
        click.echo(f'Changed file: {file_name_diff}')
        click.echo(f'Deleted file: {file_name_delete}')

    compaire_data = SqliteDict(sqlite_dict, autocommit=True)
    if csv_metadata_file_compair and not entity:
        length = number_records_in_file(csv_metadata_file_compair, 'csv')
        with open(csv_metadata_file_compair, 'r', buffering=1) as meta_file:
            label = f'Loading: {compair}'
            with click.progressbar(meta_file, length=length,
                                   label=label) as metadata:
                for metadata_line in metadata:
                    pid, data = get_pid_data(metadata_line)
                    compaire_data[pid] = data
    elif entity:
        entity_class = get_entity_class(entity)
        length = entity_class.count()
        ids = entity_class.get_all_ids()
        with click.progressbar(ids, length=length) as record_ids:
            for id in record_ids:
                record = entity_class.get_record_by_id(id)
                pid = record.pid
                compaire_data[pid] = record
        db.session.close()

    with open(csv_metadata_file, 'r', buffering=1) as metadata_file:
        for metadata_line in metadata_file:
            pid, data = get_pid_data(metadata_line)
            if pid in compaire_data:
                if compaire_data[pid] != data:
                    if verbose:
                        click.echo('DIFF: ')
                        click.echo(
                            ' old:\t'
                            f'{json.dumps(compaire_data[pid], sort_keys=True)}')
                        click.echo(
                            f' new:\t{json.dumps(data, sort_keys=True)}')
                    if output:
                        file_diff.write(data)
                del (compaire_data[pid])
            else:
                if verbose:
                    click.echo(f'NEW :\t{json.dumps(data, sort_keys=True)}')
                if output:
                    file_new.write(data)

    for pid, data in compaire_data.items():
        if verbose:
            click.echo(f'DEL :\t{json.dumps(data, sort_keys=True)}')
        if output:
            file_delete.write(data)

    # the writers only exist when an output file was requested
    if output:
        file_new.close()
        file_diff.close()
        file_delete.close()
    sys.exit(0)
from sqlitedict import SqliteDict

source = "/../../../../scratch/manoelribeiro/helpers/text_dict.sqlite"
path = './../data/sqlite/split_texts/'

print("Start")
value_dict = SqliteDict(source, tablename="text", flag="r")
print("value_dict")

new_value_dict = SqliteDict(f"{path}text_dict_{0}.sqlite",
                            tablename="value", journal_mode='OFF')

c = 0
for key, value in value_dict.items():
    c += 1
    if c % 10000000 == 0:
        print(c)
        new_value_dict.commit()
        new_value_dict.close()
        new_value_dict = SqliteDict(f'{path}text_dict_{c}.sqlite',
                                    tablename="value", journal_mode='OFF')
    new_value_dict[key] = value

new_value_dict.commit()
new_value_dict.close()
class MyForm(QMainWindow):
    def __init__(self):
        super().__init__()
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.ui.button_load_data.clicked.connect(self.load)
        self.ui.edit_database_path.editingFinished.connect(self.load_from_lineEdit)
        self.ui.button_save.clicked.connect(self.save)
        self.ui.button_add.clicked.connect(self.add_entry)
        self.ui.button_plan.clicked.connect(self.gen_week_table)

    def closeEvent(self, event):
        close = QMessageBox()
        close.setText("Speichern ?")
        close.setStandardButtons(QMessageBox.Yes | QMessageBox.Cancel)
        close = close.exec()
        if close == QMessageBox.Yes:
            self.save()
            event.accept()
        else:
            event.accept()

    def load(self):
        # Load SQLdict
        # try:
        self.fname = QFileDialog.getOpenFileName(self, 'Open file', '', 'Database *.sqlite')  # get file path
        self.ui.edit_database_path.setText('Loaded database: ' + self.fname[0])  # set path to LineEdit
        self.Kochbuch = SqliteDict(self.fname[0], autocommit=True)  # load main dictionary / dishes
        self.create_content_table()
        # except:
        #     print('Cannot load specified file!\nError in main.load()')
        #     pass

    def load_from_lineEdit(self):
        # Load from LineEdit
        try:
            self.fname[0] = self.ui.edit_database_path.text()[17:]  # get text without prefix
            self.Kochbuch = SqliteDict(self.fname[0], autocommit=True)  # load main dictionary / dishes
            self.create_content_table()
        except Exception:
            print('Cannot load specified file!\nError in main.load_from_lineEdit()')
            pass

    def save(self):
        # get items from content_table, update Kochbuch and commit it to database
        self.Kochbuch.clear()
        table = self.ui.content_table
        header_items = [table.model().headerData(i, Qt.Horizontal)
                        for i in range(table.model().columnCount())]
        for row_index in range(self.ui.content_table.rowCount()):
            # Every row is one dish / Gericht
            temp_dict = dict()
            for col_index, item in enumerate(header_items):
                temp_dict[item] = table.cellWidget(row_index, col_index).text()
            self.add_gericht(temp_dict)

    def add_entry(self):
        # Add empty entry to table
        row_cnt = self.ui.content_table.rowCount()
        col_cnt = self.ui.content_table.columnCount()
        self.ui.content_table.insertRow(row_cnt)
        for col_index in range(col_cnt):
            self.ui.content_table.setCellWidget(row_cnt, col_index, QLineEdit())
            if col_index == col_cnt - 1:
                # Delete option
                self.ui.content_table.setCellWidget(row_cnt, col_index, QPushButton('Delete'))
                self.ui.content_table.cellWidget(row_cnt, col_index).clicked.connect(self.remove_entry)

    def remove_entry(self):
        table = self.ui.content_table
        # -------------- Remove row ------------
        column = table.currentColumn()
        row = table.currentRow()
        table.removeRow(row)
        # ------------- Remove dict entry --------
        # name = table.cellWidget(row, 0).text()
        # self.del_gericht(name)

    def create_content_table(self):
        # Creates the widgets inside the table
        table = self.ui.content_table
        table.setRowCount(len(self.Kochbuch))
        header_items = [table.model().headerData(i, Qt.Horizontal)
                        for i in range(table.model().columnCount())]
        row_label = []
        col_cnt = table.model().columnCount()
        for row_index, val in enumerate(self.Kochbuch.items()):
            # row_label.append(str(row_index + 1) + ' ' + str(val[0]))
            for col_index in range(col_cnt):
                table.setCellWidget(row_index, col_index, QLineEdit())
                if col_index == col_cnt - 1:
                    # Add delete button
                    table.setCellWidget(row_index, col_index, QPushButton('Delete'))
                    table.cellWidget(row_index, col_index).clicked.connect(self.remove_entry)
        # self.ui.content_table.setVerticalHeaderLabels(row_label)
        self.set_text_to_table(header_items)

    def set_text_to_table(self, header_items):
        table = self.ui.content_table
        for row_index, val in enumerate(self.Kochbuch.items()):
            table.cellWidget(row_index, 0).setText(val[0])  # Name column / set name
            # print(val[1].values())
            for col_index, item in enumerate(header_items[1:]):
                try:
                    table.cellWidget(row_index, col_index + 1).setText(val[1][item])
                except KeyError:
                    if item is None:
                        # Used so that the delete button text will not be overwritten
                        pass
                    else:
                        # Set unfilled category empty
                        table.cellWidget(row_index, col_index + 1).setText('')

    def add_gericht(self, entries: dict):
        # Old func args: name: str, Fisch: bool, Nudeln: bool, Vortag: bool, SE: bool,
        #                WE: bool, WE_Wichtung: float, Wichtung: float
        # Dishes are stored in a dict().
        # Each dish is categorised by:
        # - Fisch: bool
        # - Nudeln: bool
        # - Vortag: bool (if a lot is ordered on Sunday, cook the previous day's meal)
        # - SE: bool (Sunday meal)
        # - WE: bool (weekend meal, e.g. pick-up/ordered food)
        # - WE_Wichtung: float (every dish should have a chance on the weekend;
        #   pick-up/ordered food or e.g. Rouladen should be preferred)
        # - Wichtung: float (probability of the dish being chosen, to avoid duplicates;
        #   1.0 = will be chosen, 0 = will not be chosen)
        # etc.
        # -------------------------------------------------------------------------------
        # Tortillas = dict()           # one dish dict()
        # Tortillas['Fisch'] = False
        # Tortillas['Nudeln'] = False
        # Tortillas['Vortag'] = False
        # Tortillas['SE'] = False
        # Tortillas['WE'] = False
        # Tortillas['WE_Wichtung'] = 0.1
        # Tortillas['Wichtung'] = 1.0
        # Gerichte['Tortillas'] = Tortillas
        # -------------------------------------------------------------------------------
        '''name_dict = dict()
        name_dict['Fisch'] = Fisch
        name_dict['Nudeln'] = Nudeln
        name_dict['Vortag'] = Vortag
        name_dict['SE'] = SE
        name_dict['WE'] = WE
        name_dict['WE_Wichtung'] = WE_Wichtung
        name_dict['Wichtung'] = Wichtung'''
        name = entries['Name']
        self.Kochbuch[name] = entries

    def del_gericht(self, name):
        self.Kochbuch.pop(name)

    def update_kochbuch(self, Kochbuch: dict, name: str, kategorie: str, value: any):
        # Because this works with a dict inside a dict, and the outermost dict is an
        # SQLdict, entries have to be modified in a slightly roundabout way to get
        # them from RAM into the SQL file.
        update = Kochbuch[name]
        update[kategorie] = value
        Kochbuch[name] = update

    def choose(self, dishes):
        choosed_dish = np.asarray(dishes)
        Wichtungen = choosed_dish[:, 1].astype(float)
        # choosed_dish = np.sort(choosed_dish)
        # Find maxima in the Wichtung column
        max_indizes = np.where(Wichtungen == np.amax(Wichtungen))
        finds = []
        for i in max_indizes[0]:
            finds.append(choosed_dish[i])
        # Choose dish
        # If len of finds > 1 use random int to choose dish
        if len(finds) > 1:
            dish_index = random.randint(0, len(finds) - 1)
            return finds[dish_index]
        else:
            return finds
        # print('test', '\n', choosed_dish)

    def gen_week_table(self):
        # generate table of dishes day-wise
        dishes = [i for i in self.Kochbuch.items()]
        dishes_cnt = len(dishes)
        usable_dishes = []
        possible_dishes_mon = []
        possible_dishes_tue = []
        possible_dishes_wed = []
        possible_dishes_thu = []
        possible_dishes_fri = []
        possible_dishes_sat = []
        possible_dishes_sun = []
        saison = 'Winter'
        for index, dish in enumerate(dishes):
            dish = dish[1]
            # Perform standard check, to reduce dishes according to seasons and weight
            # if float(dish['Wichtung']) > 0.7 and dish['Saison'] == saison:  # Standard check
            if float(dish['Wichtung']) > 0.7 and (dish['Saison'] == saison or dish['Saison'] == 'None'):
                usable_dishes.append(dish)

            # ----------- Monday -------------
            # --------------------------------
            # Mondays should prefer Nudeln ---> bonus for Nudeln == True
            if dish['Fisch'] == 'False':
                if dish['Nudeln'] == 'True':
                    possible_dishes_mon.append([dish['Name'], float(dish['Wichtung']) + 0.3])
                else:
                    possible_dishes_mon.append([dish['Name'], float(dish['Wichtung'])])
            # -----------------------------------------------------------------------------
            # ----------- Tuesday/Wednesday/Thursday -------------
            # ----------------------------------------------------
            # Days without preferences
            if dish['Fisch'] == 'False' and dish['Vortag'] == 'False':  # Standard check
                possible_dishes_tue.append([dish['Name'], float(dish['Wichtung'])])
                possible_dishes_wed.append([dish['Name'], float(dish['Wichtung'])])
                possible_dishes_thu.append([dish['Name'], float(dish['Wichtung'])])
            # -----------------------------------------------------------------------------
            # ----------- Friday -------------
            # --------------------------------
            # Fish preferred
            if dish['WE'] == 'True' and dish['SE'] == 'False':
                if dish['Fisch'] == 'True':
                    possible_dishes_fri.append([dish['Name'], float(dish['Wichtung']) + 0.3])
                else:
                    possible_dishes_fri.append([dish['Name'], float(dish['Wichtung'])])
            # -----------------------------------------------------------------------------
            # ----------- Saturday -------------
            # ----------------------------------
            # WE category preferred
            if dish['Fisch'] == 'False' and dish['SE'] == 'False' and dish['Vortag'] == 'False':
                if dish['WE'] == 'True':
                    possible_dishes_sat.append([dish['Name'], float(dish['Wichtung']) + 0.3])
                else:
                    possible_dishes_sat.append([dish['Name'], float(dish['Wichtung'])])
            # -----------------------------------------------------------------------------
            # ----------- Sunday -------------
            # --------------------------------
            # SE highly preferred
            if dish['Fisch'] == 'False' and dish['Vortag'] == 'False':
                if dish['SE'] == 'True':
                    possible_dishes_sun.append([dish['Name'], float(dish['Wichtung']) + 0.5])
                else:
                    possible_dishes_sun.append([dish['Name'], float(dish['Wichtung'])])

        print('============================================================================')
        print('=================================Wochenplan=================================')
        print('============================================================================')
        print('Monday: ', self.choose(possible_dishes_mon)[0])
        print('----------------------------------------------------------------------------')
        print('Tuesday: ', self.choose(possible_dishes_tue)[0])
        print('----------------------------------------------------------------------------')
        print('Wednesday: ', self.choose(possible_dishes_wed)[0])
        print('----------------------------------------------------------------------------')
        print('Thursday: ', self.choose(possible_dishes_thu)[0])
        print('----------------------------------------------------------------------------')
        print('Friday: ', self.choose(possible_dishes_fri)[0])
        print('----------------------------------------------------------------------------')
        print('Saturday: ', self.choose(possible_dishes_sat)[0])
        print('----------------------------------------------------------------------------')
        print('Sunday: ', self.choose(possible_dishes_sun)[0])
        print('============================================================================')
        # print(self.choose(possible_dishes_mon)[0])

        Speiseplan = collections.OrderedDict()