Example #1
def persistence_load(db_path=config.WN_FEATURE_CACHE_PATH):
    p_dict = {
        'hypernym_stems_dict': dict(),
        'hyponym_stems_dict': dict(),
        'hyper_lvl_dict': dict(),
        'hypo_lvl_dict': dict(),
        'ant_dict': dict(),
        'em_lemmas_dict': dict(),
    }

    # if em_dict:
    #     p_dict['em_dict'] = dict()

    for dict_name in p_dict.keys():
        print("Loading Persistent WN Feature Dict:", dict_name)
        # The original if/elif branches for 'em_dict' were byte-for-byte
        # identical, so they collapse into a single load path.
        in_db_dict = SqliteDict(str(db_path / dict_name),
                                autocommit=False,
                                tablename='the_table',
                                flag='c')
        for key, v in tqdm(in_db_dict.items()):
            p_dict[dict_name][key] = v
        in_db_dict.close()

    return p_dict
Example #2
def distribution_from_data_db(data_db: SqliteDict, vec_dim: int):
    sums = np.zeros(vec_dim, dtype=np.float32)
    squares = np.zeros(vec_dim, dtype=np.float32)
    for _, vec in data_db.items():
        sums += vec
    means = sums / len(data_db)
    for _, vec in data_db.items():
        squares += (vec - means)**2
    # np.sqrt of the mean squared deviation yields the standard deviation,
    # not the variance; the name is corrected to match.
    stds = np.sqrt(squares / len(data_db))
    weights = list(zip(means, stds))
    return weights
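A note on Example #2: the two full passes over data_db can be collapsed into one. A minimal single-pass sketch using Welford's online algorithm, assuming the same SqliteDict of float32 vectors (this helper is illustrative, not from the original project):

import numpy as np
from sqlitedict import SqliteDict

def distribution_single_pass(data_db: SqliteDict, vec_dim: int):
    # Welford's algorithm: numerically stable mean/std in one pass.
    count = 0
    mean = np.zeros(vec_dim, dtype=np.float32)
    m2 = np.zeros(vec_dim, dtype=np.float32)
    for _, vec in data_db.items():
        count += 1
        delta = vec - mean
        mean += delta / count
        m2 += delta * (vec - mean)
    stds = np.sqrt(m2 / count)  # population standard deviation
    return list(zip(mean, stds))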
Example #3
def sqlite_to_array(num):

    p2 = SqliteDict(f"{args.src}perspective_value{num}.sqlite",
                    tablename="value",
                    flag="r")

    c = 0
    t_ini = time()
    ids = []
    perspective = []

    for key, value in p2.items():
        if c % 100000 == 0:
            print("iteration number", c, "at", round((time() - t_ini) / 60, 2), "minutes")
        c += 1

        if c % 5000000 == 0:
            save_arrays(num, perspective, ids, c)
            ids = []
            perspective = []

        ids.append(key)
        perspective.append(tuple(value.values()))
    # Flush the remainder after the loop; the extra increment presumably
    # keeps this final chunk's suffix distinct from any in-loop save.
    c += 1
    save_arrays(num, perspective, ids, c)
Example #4
class CDataBase(object):
    def __init__(self):
        # Nothing exists to close on first construction; the original
        # try: self.close() always raised and swallowed AttributeError.
        self.mydict = SqliteDict('./DB/my_db.sqlite', autocommit=True)
        self.show()

    def set(self, key, value):
        self.mydict[key] = value

    def get(self, key):
        # SqliteDict supports .get() directly; scanning .keys() is wasteful.
        return self.mydict.get(key)

    def show(self, start_with=''):
        for key, value in self.mydict.items():
            # str.find() returns 0 (falsy) for a match at the start, so the
            # original condition did the opposite of its intent.
            if key.startswith(start_with):
                print(key, '\t', value, '\n')

    def clear(self):
        self.mydict.clear()

    def close(self):
        self.mydict.close()
Example #5
def check_for_cached_result(query: str,
                            thresh: float = 0.8,
                            sim=cosine_sim) -> tuple:
    query_map = SqliteDict(query_map_path)
    query_results = SqliteDict(query_db_path)
    # see if we have computed this exact query before
    try:
        result = query_results[query]
        query_vec = query_map[query]
        query_map.close()
        query_results.close()
        return result, query_vec
    except KeyError:
        pass
    # If not, see if any query is close enough
    processed_query = process_query(query)
    max_sim_query, score = get_max_sim(processed_query,
                                       query_map.items(),
                                       sim=sim)
    if score > thresh:
        result = query_results[max_sim_query]
        # Presumably the matched query's vector should accompany the result,
        # matching the (result, vector) shape of the other return paths;
        # the bare `return result` broke the annotated tuple contract.
        match_vec = query_map[max_sim_query]
        query_map.close()
        query_results.close()
        return result, match_vec
    query_map.close()
    query_results.close()
    return [], processed_query
Example #6
def process_db(dbpath, options):
    output_count = 0
    # No context manager (and no close()) as this is read-only and
    # close() can block for a long time for no apparent reason.
    db = SqliteDict(dbpath, flag='r', autocommit=False)
    for key, value in db.items():
        root, ext = os.path.splitext(key)
        if ext != options.suffix:
            continue
        if options.random is not None and options.random < random():
            continue

        if options.id_prefix is None:
            doc_id = root
        else:
            doc_id = options.id_prefix + root

        text = value.rstrip('\n').replace('\n', ' ').replace('\t', ' ')

        print('{}\t<AUTHORS>\t<JOURNAL>\t<YEAR>\t{}'.format(doc_id, text))

        output_count += 1
        if options.limit is not None and output_count >= options.limit:
            break

    return output_count
Example #7
class TagsData(object):
    _tags_db_path = './src/server/tags.sqlite'

    def __init__(self, db_path=_tags_db_path):
        self._tags_db = SqliteDict(db_path, tablename='tags', autocommit=True)
        self._metadata = self.compute_metadata()

    def compute_metadata(self):
        res = []
        for tag_user, data in self._tags_db.items():
            tag, user = tag_user.split(':')
            num_tagged = len(data['pos']) + len(data['neg']) + len(data['unk'])
            res.append({'tag': tag, 'num': num_tagged, 'user': user})
        res.sort(key=lambda item: item['num'], reverse=True)
        return res

    def get_metadata(self):
        return self._metadata

    def get_data(self, tag, user):
        tag_user = tag + ':' + user
        if tag_user in self._tags_db:
            return self._tags_db[tag_user]
        return {'pos': [], 'neg': [], 'unk': []}

    def save_data(self, tag, user, pos, neg, unk):
        tag_user = tag + ':' + user
        self._tags_db[tag_user] = {'pos': pos, 'neg': neg, 'unk': unk}
        # recompute metadata.
        # This could be done more efficiently with an incremental update
        self._metadata = self.compute_metadata()
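The comment in save_data flags the full recompute as inefficient. A hedged sketch of the incremental variant it hints at, replacing only the affected tag:user entry (same fields as compute_metadata; not from the original source):

    def save_data(self, tag, user, pos, neg, unk):
        tag_user = tag + ':' + user
        self._tags_db[tag_user] = {'pos': pos, 'neg': neg, 'unk': unk}
        # Drop the stale entry for this tag/user pair, append the fresh one,
        # and re-sort; no full pass over the database is needed.
        self._metadata = [m for m in self._metadata
                          if not (m['tag'] == tag and m['user'] == user)]
        self._metadata.append(
            {'tag': tag, 'num': len(pos) + len(neg) + len(unk), 'user': user})
        self._metadata.sort(key=lambda item: item['num'], reverse=True)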
Example #8
    def check(self,
              unique_pos_in='data/pos_unique.tab',
              unique_pos_out='data/pos_unique.db',
              unique_ner_in='data/ner_unique.tab',
              unique_ner_out='data/ner_unique.db'):
        """ 
        init constants from data POS NER

        Keyword arguments:
        pos_in pos_out and ner_in ner_out

        pos_in line structure as [tag \t description ] : NOUN    noun, singular or mass
        ner_in line structure as [type \t description ] : ORG	Companies, agencies, institutions, etc.

        Raises:
            Exception:  check your pos data path or check your ner data path
        """
        if not os.path.exists(unique_pos_out):
            if not os.path.exists(unique_pos_in):
                raise Exception(
                    'check your pos unique path ({})'.format(unique_pos_in))
            else:
                unique_pos = pd.read_csv(unique_pos_in, sep='\t')
                with SqliteDict(unique_pos_out, autocommit=True) as db:
                    for _, row in unique_pos.iterrows():
                        db[row.iloc[0]] = row.iloc[1]
                    self.POS = list(db.items())
        else:
            db = SqliteDict(unique_pos_out)
            self.POS = list(db.items())
            db.close()

        if not os.path.exists(unique_ner_out):
            if not os.path.exists(unique_ner_in):
                raise Exception(
                    'check your ner unique path ({})'.format(unique_ner_in))
            else:
                unique_ner = pd.read_csv(unique_ner_in, sep='\t')
                with SqliteDict(unique_ner_out, autocommit=True) as db:
                    for _, row in unique_ner.iterrows():
                        db[row.iloc[0]] = row.iloc[1]
                    self.NER = list(db.items())
        else:
            db = SqliteDict(unique_ner_out)
            self.NER = list(db.items())
            db.close()
Example #9
def next_batch_from_dataset(data_db: SqliteDict, batch_size=20000):
    qkeys = []
    feature_vectors = []
    for q, vec in data_db.items():
        qkeys.append(q)
        feature_vectors.append(vec)
        if len(qkeys) == batch_size:
            yield qkeys, feature_vectors
            qkeys = []
            feature_vectors = []
    if len(qkeys) > 0:
        yield qkeys, feature_vectors
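Usage of the batching generator in Example #9 might look like this (the database path and the np.asarray call are illustrative assumptions):

import numpy as np
from sqlitedict import SqliteDict

db = SqliteDict('features.sqlite', flag='r')  # hypothetical path
for qkeys, vecs in next_batch_from_dataset(db, batch_size=4096):
    batch = np.asarray(vecs)  # shape (<=4096, vec_dim)
    # ... feed `batch` to the model here
db.close()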
Example #10
    def gen_doc(self, is_pos=True):
        # The two branches differed only in their paths, so they share one
        # body. pd.DataFrame.from_dict expects a mapping, so the items()
        # generator is materialised into a list for the plain constructor.
        if is_pos:
            path, db_path = 'ui/public/pos.tab', self.path_pos
        else:
            path, db_path = 'ui/public/ner.tab', self.path_ner
        db = SqliteDict(db_path)
        frame = pd.DataFrame(list(db.items()))
        db.close()
        frame.to_csv(path,
                     sep='\t',
                     index=False,
                     encoding='utf8',
                     header=False)
        return 'generate {} file in : {}'.format('pos' if is_pos else 'ner', path)
Example #11
def get_nearest(query_vec: BagOfWordsVector,
                k: int = 20,
                thresh: float = 0.0,
                sim=cosine_sim,
                return_all: bool = False) -> list:
    # Generate tuple list with entries in the form of (<doc_id>, <doc_vector>)
    db = SqliteDict(doc_vecs_db_path)
    doc_pairs = [(key, value.vector) for key, value in db.items()]
    db.close()
    if thresh != 0:
        results = search_by_threshold(query_vec, doc_pairs, thresh, sim=sim)
    else:
        results = search_by_knn(query_vec,
                                doc_pairs,
                                k,
                                sim=sim,
                                return_all=return_all)
    return results
Example #12
def sqlite_to_array(num):
    emp_sql = SqliteDict(f"{args.src}empath_value{num}.sqlite", tablename="value", flag="r")
    t_ini = time()
    ids = []
    emp_values = []
    c = 0

    for key, value in emp_sql.items():
        if c % 1000000 == 0:
            print("iteration number ", c, "at", round((time()-t_ini)/60, 2), "minutes")
        c += 1

        ids.append(key)
        emp_values.append(tuple(value.values()))

        if c % 10000000 == 0:
            # The original dropped the `num` argument here, unlike the final
            # call below and the matching pattern in Example #3.
            save_arrays(num, emp_values, ids, c)
            ids = []
            emp_values = []

    save_arrays(num, emp_values, ids, c)
Example #13
def main():
    args = docopt(__doc__, help=True)
    input_file = args['<unsplash_sqlite_file>']
    output_file = args['<output_sqlite_file>']
    input_data = SqliteDict(input_file, tablename='images', flag='r')
    tag_data = defaultdict(list)
    for img, img_data in input_data.items():
        if 'tags' in img_data and 'urls' in img_data:
            for tag in img_data['tags']:
                tag_data[tag] += [img]
    with SqliteDict(output_file, tablename='tags',
                    autocommit=True) as output_data:
        # The original rebound output_data to a plain dict here, so nothing
        # ever reached the SqliteDict. Build the nested dict first, then
        # assign it to a key so it is actually persisted.
        unsplash = {}
        for tag in tag_data:
            img_list = tag_data[tag]
            if len(img_list) >= MIN_IMAGE_THRESHOLD:
                unsplash[tag] = {
                    'pos': img_list,
                    'neg': [],
                    'unk': [],
                }
        output_data['unsplash'] = unsplash
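The bug fixed above is a common SqliteDict pitfall: values are serialised on assignment, so mutating a stored value in place never reaches the database. A minimal illustration (demo.sqlite is a throwaway path):

from sqlitedict import SqliteDict

with SqliteDict('demo.sqlite', autocommit=True) as d:
    d['tags'] = {'unsplash': {}}
    d['tags']['unsplash']['cat'] = ['img1']  # mutates a temporary copy; never persisted
    tags = d['tags']                         # read ...
    tags['unsplash']['cat'] = ['img1']       # ... modify ...
    d['tags'] = tags                         # ... write back to persist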
Example #14
    # The snippet is truncated in the source; by analogy with the fuller
    # variant in Example #27, this is the tail of a token-cleanup helper.
    while '' in tokens:
        tokens.remove('')

    tokens = list(tokens)
    return tokens


X_test = []
y_test = []
'''
Build testing dataset
'''
print('Building testing dataset')
ctr_breaker = 0
start_time = time.time()

for claimId, val in testing_db.items():
    supportsOrRefutes = val[1]

    if ctr_breaker % 500 == 0:
        print(ctr_breaker)

    if ctr_breaker == 30000:
        break

    ctr_breaker += 1

    if supportsOrRefutes != 'NOT ENOUGH INFO':
        claim = val[2]

        claimTokens = tokenise_line(claim)
Example #15
from sqlitedict import SqliteDict
import numpy as np

linksDB = SqliteDict('phase2-links.sqlite', autocommit=True)
metadataDB = SqliteDict('phase2-metadata.sqlite', autocommit=True)

d = 0.1
pageRanks = {}
pageRanksTemp = {}
for page, metadata in metadataDB.items():
    pageRanks[page] = 1
    pageRanksTemp[page] = 1

numberIterations = 3
for iteration in range(numberIterations):
    for page, metadata in metadataDB.items():
        score = 0
        for childPage, _ in metadataDB.items():
            if int(page) in linksDB[childPage]['out']:
                score += pageRanks[childPage] / len(linksDB[childPage]['out'])
        pageRanksTemp[page] = d + (1 - d) * score
    # Copy, don't alias: `pageRanks = pageRanksTemp` made both names point
    # at the same dict, corrupting the next iteration's reads.
    pageRanks = dict(pageRanksTemp)

for pageID, rank in pageRanks.items():
    data = metadataDB[pageID]
    # np.insert returns a new array, and SqliteDict values must be
    # reassigned to persist, so write the result back explicitly.
    metadataDB[pageID] = np.insert(data, len(data), rank)
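The inner loop above scans every page for every page, so each iteration costs O(N^2) SqliteDict reads. A sketch that precomputes a reverse-link map once, assuming the same linksDB layout (string page keys, integer IDs in each 'out' list, as the int(page) cast suggests):

# Precompute which pages link to each page; each sweep is then O(edges).
in_links = {page: [] for page in pageRanks}
out_degree = {}
for child in pageRanks:
    outs = linksDB[child]['out']
    out_degree[child] = len(outs)
    for target in outs:
        target = str(target)  # 'out' holds ints, the DB keys are strings
        if target in in_links:
            in_links[target].append(child)

for iteration in range(numberIterations):
    for page in pageRanks:
        score = sum(pageRanks[c] / out_degree[c] for c in in_links[page])
        pageRanksTemp[page] = d + (1 - d) * score
    pageRanks = dict(pageRanksTemp)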
Example #16

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dnscache',
                        default="dnscache.sqld",
                        help='IP address cache default: %(default)s')
    parser.add_argument(
        '--download',
        default="pages.sqld",
        help='Here is where the downloaded pages go: %(default)s')
    parser.add_argument(
        '--r404',
        default="404.sqld",
        help='Here is where we remember pages that gave 404 etc: %(default)s')
    args = parser.parse_args()

    #2) Results setup
    result_store = SqliteDict(args.download,
                              encode=gz_encode,
                              decode=gz_decode,
                              autocommit=True)

    for url, cont in result_store.items():
        print(url, cont[:30])

    #3) 404 setup
    r404 = SqliteDict(args.r404, autocommit=True)
    for url, status in r404.items():
        print(url, status)
Example #17
from sqlitedict import SqliteDict
from datetime import datetime
import pickle

source = "./../data/sqlite/community_texts/"
path = "./../data/sentiment/community_id/"
names_list = ["right-center", "Alt-right", "center", "right", "left", "left-center", "IDW", "Alt-lite"]
for name in names_list:
    community = SqliteDict(f"{source}{name}.sqlite", tablename="value", flag="r")
    ks = {}
    c = 0
    for key, value in community.items():
        if c % 1000000 == 0:
            print("Iteration number", c)
        c += 1

        ks[key] = datetime.fromtimestamp(value["timestamp"]//1000).year

    community.close()
    with open(f'{path}{name}.pickle', 'wb') as handle:
        pickle.dump(ks, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Wrote {path}{name}.pickle")
Example #18
class BackendDbHandler(object):
    """Table structure

    target_pages: A table to save URL where folklore is. Key-value pair. {url_string: TargetPage object}
    target_html: A table to save HTML of folklore. Key-value pair. {url_string: ExtractedPage object}
    """
    def __init__(self, path_db_file: str, interval: int = 3):
        self.db_target_pages = SqliteDict(path_db_file,
                                          autocommit=True,
                                          tablename='target_pages',
                                          encode=json.dumps,
                                          decode=json.loads)
        self.db_html = SqliteDict(path_db_file,
                                  autocommit=True,
                                  tablename='target_html',
                                  encode=json.dumps,
                                  decode=json.loads)
        self.interval = interval

    def save_target_urls(self, target_urls: List[str]):
        """Save target URL into DB."""
        for url in target_urls:
            if url not in self.db_target_pages:
                data, errs = TargetPage(strict=True).load({
                    'page_url': url,
                    'status': False,
                    'note': '',
                    'extracted_at': ''
                })
                self.db_target_pages[url] = data
            else:
                logger.info('URL={} is already in target. Skip.'.format(url))
        # The original hung this commit on a for/else clause; with no break
        # in the loop, a plain post-loop commit is equivalent and clearer
        # (and redundant anyway given autocommit=True).
        self.db_target_pages.commit()

    def run_html_extraction(self,
                            is_force_retry: bool = False,
                            limit: int = -1):
        """Gets all target page and save them into DB."""
        default_i = 0
        for url, page_obj in tqdm(list(self.db_target_pages.items())):
            _obj = TargetPage(strict=True).load(page_obj)
            if page_obj['status'] is False or is_force_retry is True:
                try:
                    html_doc = requests.get(url).text
                    error_msg = ''
                    status = True
                except requests.RequestException as e:
                    # ExtractedPage is a schema class, not an exception; the
                    # network call is presumably what can fail here.
                    html_doc = ''
                    error_msg = e.__str__()
                    status = False

                data, errs = ExtractedPage(strict=True).load({
                    'page_url': url,
                    'status': status,
                    'html_document': html_doc,
                    'note': error_msg,
                    'extracted_at': datetime.now().__str__()
                })
                page_obj['status'] = True
                page_obj['extracted_at'] = datetime.now().__str__()
                # Per the class docstring, the HTML record presumably belongs
                # in the target_html table and the refreshed status in
                # target_pages; the original overwrote the target entry with
                # the HTML record and discarded the page_obj updates.
                self.db_html[url] = data
                self.db_target_pages[url] = page_obj
                default_i += 1
                time.sleep(self.interval)
                if default_i == limit:
                    logger.info('Terminated by limit={}'.format(limit))
                    break
            else:
                logger.info('URL={} already extracted. Skip.'.format(url))
        # Commit unconditionally: the original for/else skipped both commits
        # whenever the limit break fired, which is presumably unintended.
        self.db_target_pages.commit()
        self.db_html.commit()

    def show_extracted_html(self) -> List[Dict[str, Any]]:
        extracted = []
        for url, obj_ in self.db_target_pages.items():
            data, errs = ExtractedPage(strict=True).load(obj_)
            if data['status']:
                extracted.append(obj_)
        return extracted
Example #19
# Creates a sqlite for each category

from sqlitedict import SqliteDict

splits = [i*10000000 for i in range(0, 8)]
source = './../data/sqlite/split_texts/'
path = "./../data/sqlite/community_texts/"
actual_category = "none"
category_dict = SqliteDict(f"{path}AL.sqlite", tablename="value", journal_mode="OFF")
text_dict = SqliteDict(f"{source}text_dict_{0}.sqlite", tablename="value", flag="r")

c = 0
for num in splits:
    category_dict.commit()
    text_dict.close()
    text_dict = SqliteDict(f"{source}text_dict_{num}.sqlite", tablename="value", flag="r")

    print(num)
    for id_c, value in text_dict.items():
        if value["category"] != actual_category:
            # Rotate the output DB when the category changes, and remember
            # the new category: the original never updated actual_category,
            # so it closed and reopened a SqliteDict for every single record.
            category_dict.commit()
            category_dict.close()
            category_dict = SqliteDict(f"{path}{value['category']}.sqlite", tablename="value", journal_mode="OFF")
            actual_category = value["category"]
        # Write every record, not only the ones that trigger a rotation,
        # as the original's indentation did.
        category_dict[id_c] = value

category_dict.commit()
category_dict.close()
Example #20
                               journal_mode='OFF')
page_title_token_positionsDB = SqliteDict(
    'phase2-page_title_token_positions.sqlite', journal_mode='OFF')
page_title_inverted_index = SqliteDict(
    'phase2-page_title_inverted_index.sqlite', journal_mode='OFF')

# cache heavily accessed dicts in main memory as python dicts
pageID2tf = {}
for key, value in metadataDB.items():
    pageID2tf[int(key)] = np.uint16(value[5])

# invertedIndex = dict((int(k),v) for k,v in invertedIndex.items())
# page_title_inverted_index = dict((int(k),v) for k,v in page_title_inverted_index.items())

# invert mappings
id2token = dict((v, k) for k, v in token2id.items())
id2page = dict((v, k) for k, v in page2id.items())


@app.route('/')
def startpage():
    return render_template('startpage.html')


@app.route('/result', methods=['POST', 'GET'])
def result():
    if request.method == 'POST':
        result = request.form

        # The form is assumed to contain a single field whose value is the
        # search query.
        for key, value in result.items():
            query = value
Example #21
                              'v1alpha1',
                              developerKey=api_key)

    # Initiating the DataBases:
    dict_c = SqliteDict(args.src, tablename="text", flag="r")
    value_dict = SqliteDict(args.dst, tablename="value", journal_mode='OFF')

    # Initiating multi-process pool:
    workers = 500  # sized to match the API's requests-per-second limit (the original comment still said 20)
    p = Pool(workers, initializer=initialize_worker)

    time_iter = time()
    print("requesting slice", args.init, "to", args.end)
    to_request = [
        (k, v["text"], args.dst)
        for k, v in itertools.islice(dict_c.items(), args.init, args.end)
    ]
    time_end = time()
    dif = (args.end - args.init) // args.loop
    print(f"Time to iter: {round((time_end - time_iter) / 60, 2)}")
    for i in range(args.loop):
        time_init = time()
        p.starmap(process_text, to_request[i * dif:(i + 1) * dif])
        time_end = time()
        dt = time_end - time_init
        print(f"Time to run the {i} loop is {round(dt/60, 2)}")
        if i != args.loop - 1:
            sleep(max(0, 100 - dt))  # avoid a negative sleep when a loop overruns 100 s

    # Running Perspective
    # add_perspective(to_request, dict_c, value_dict, p)
Example #22
from sqlitedict import SqliteDict
import pickle

t = []
ct = 0
c = 0
mydict = SqliteDict(
    "/../../../../../scratch/manoelribeiro/helpers/authors_dict.sqlite",
    tablename="authors",
    flag="r")

for _, value in mydict.items():
    if len(value) > 1:
        t.append(value)
        ct += 1
    if (c >= 11300000):
        break
    if (c % 100000) == 0:
        print(c, ct)
    c += 1

with open("authors_split_new2.pickle", "wb") as fp:
    pickle.dump(t, fp, protocol=pickle.HIGHEST_PROTOCOL)
Example #23
    def load_annotations(self, ann_file):
        self.split = self.img_prefix

        annotations, image_infos, simpsons_with_prefix = dict(), dict(), dict()
        # for prefix in ["OPEN_IMAGES", "COCO", "PASCAL", "VISUAL_GENOME"]:
        for prefix in ["VISUAL_GENOME"]:
            annotations_sqlite = SqliteDict(
                f"{ann_file}/{prefix}/annotations.sqlite")
            image_infos_sqlite = SqliteDict(
                f"{ann_file}/{prefix}/image_infos.sqlite")
            annos, infos, simpsons = dict(), dict(), dict()
            for key, anno in tqdm.tqdm(annotations_sqlite.items()):
                annos[key] = anno
                labels = [box["entity"] for box in anno.values()]
                simpson = simpson_di(labels)
                simpsons[key] = simpson

            annotations[prefix] = annos

            for key, info in tqdm.tqdm(image_infos_sqlite.items()):
                if key in annotations[prefix]:
                    infos[key] = info

            image_infos[prefix] = infos
            simpsons_with_prefix[prefix] = simpsons

        entities, attributes, nats = list(), list(), list()
        for prefix, annotation in annotations.items():
            for image_id, annos in tqdm.tqdm(annotation.items()):
                for oid, entity_anno in annos.items():
                    if entity_anno["entity"] not in [
                            "Q414241",  # part, 114
                            "Q395237",  # side, 244
                            "Q187456",  # bar, 277
                            "Q23444",  # white, 279
                            "Q9659",  # A, 310
                            "Q398475",  # This, 329
                            "Q55634432",  # surface, 337
                            "Q241124",  # Row, 451
                            "Q189171",  # Section, 478
                    ]:
                        entities.append(entity_anno["entity"])
                        nats.append(entity_anno["natural_language"])
                        attributes += entity_anno["attributes"]

        entity_counter, attribute_counter, nat_counter = (
            Counter(entities),
            Counter(attributes),
            Counter(nats),
        )
        entities = entity_counter.most_common(1600)
        attributes = attribute_counter.most_common(400)

        # ipdb.set_trace()
        # nl2wb = SqliteDict(
        #     f"{ann_file}/OPEN_IMAGES/natural_language_to_wikibase_id.sqlite"
        # )
        # wb2nl = dict()
        # for k, v in nl2wb.items():
        #     if v not in wb2nl:
        #         wb2nl[v] = k
        # nl_entities = list()
        # for e in entities:
        #     nl_entities.append((wb2nl[e[0]], e[1]))
        # nl_attributes = list()
        # for e in attributes:
        #     nl_attributes.append((wb2nl[e[0]], e[1]))
        # json.dump(nl_entities, open(f"{ann_file}/nl_entities.json", "w"), indent=2)
        # json.dump(nl_attributes, open(f"{ann_file}/nl_attributes.json", "w"), indent=2)

        self.CLASSES = [k for k, v in entities]
        self.ATTRIBUTES = [k for k, v in attributes]

        self.cat_ids = self.CLASSES
        self.attr_ids = self.ATTRIBUTES
        self.cat2label = {
            cat_id: i + 1
            for i, cat_id in enumerate(self.cat_ids)
        }
        self.attr2label = {
            attr_id: i
            for i, attr_id in enumerate(self.attr_ids)
        }

        image_ids = list()
        for prefix, annotation in annotations.items():
            for key in tqdm.tqdm(annotation):
                annot = annotation[key]
                ents = list(annot.values())
                ents = [v["entity"] for v in ents]
                if any([(e in self.CLASSES) for e in ents]):
                    image_ids.append((prefix, key))

        self.img_ids = [i[1] for i in image_ids]

        img_infos, self.annotations, self.simpsons = list(), dict(), dict()
        for prefix, i in tqdm.tqdm(image_ids):
            info = image_infos[prefix][i]
            info["id"] = f"{prefix}__{i}"
            if prefix == "OPEN_IMAGES":
                info[
                    "filename"] = f"OPEN_IMAGES/{info['split']}/{info['split']}/{i[:3]}/{i}.jpg"
            elif prefix == "PASCAL":
                info[
                    "filename"] = f"PASCAL/DATA/VOCdevkit/VOC2012/JPEGImages/{i}.jpg"
            elif prefix == "COCO":
                split = info["split"]
                info[
                    "filename"] = f"COCO/IMAGES_{split.upper()[:-4]}/{split}/{i}.jpg"
            elif prefix == "VISUAL_GENOME":
                info["filename"] = f"VISUAL_GENOME/IMAGE/{i}.jpg"
            else:
                raise NotImplementedError

            img_infos.append(info)
            self.annotations[info["id"]] = annotations[prefix][i]
            self.simpsons[info["id"]] = simpsons_with_prefix[prefix][i]

        self.img_prefix = f"{ann_file}/"

        if self.split == "train":
            img_infos = img_infos[:-1000]
        else:
            img_infos = img_infos[-1000:]

        return img_infos
Example #24
from sqlitedict import SqliteDict
import datetime
import argparse
import json
import glob

dst_fldr = "/data/savvas/incels/data/"
dst = "channels_dict.sqlite"
tmp_sqlite = glob.glob(dst_fldr + "tmp/channels_dict*")
dict_db = SqliteDict(dst_fldr + dst,
                     tablename="channels",
                     journal_mode="OFF",
                     flag="w")
for tmp in tmp_sqlite:
    print(tmp)
    tmp_dict = SqliteDict(tmp,
                          tablename="channels",
                          journal_mode="OFF",
                          flag="r")
    # print("start")
    for key, item in tmp_dict.items():
        # print(key)
        val = dict_db.get(key, [])
        val += item
        # print("1")
        dict_db[key] = val
        # print("2")
    # print("end")
    dict_db.commit()
    tmp_dict.close()
Example #25
def csv_diff(csv_metadata_file, csv_metadata_file_compair, agent, output,
             indent, verbose, sqlite_dict):
    """Agencies record diff.

    :param csv_metadata_file: csv metadata file to compair.
    :param csv_metadata_file_compair: csv metadata file to compair too.
    :param agent: agent type to compair too.
    :param verbose: Verbose.
    :param sqlite_dict: SqliteDict Db file name.
    """
    def get_pid_data(line):
        """Get json from CSV text line.

        :param line: line of CSV text.
        :returns: data as json
        """
        data = json.loads(line.split('\t')[3].replace('\\\\', '\\'))
        pid = data.get('pid')
        return pid, data

    def ordert_data(data):
        """Order data by pid.

        :param data: line of CSV text.
        :returns: pid and data as json
        """
        data = json.loads(data.split('\t')[3])
        pid = data.get('pid')
        return pid, data

    offset = '{character:{indent}}'.format(character=' ', indent=indent)

    def intent_output(data, indent):
        """Create indented output.

        :param data: data to output.
        :param indent: indent to use.
        :returns: indented data.
        """
        output = ''
        lines = json.dumps(data, indent=indent).split('\n')
        for line in lines:
            output += '\n{offset}{line}'.format(offset=offset, line=line)
        return output

    if csv_metadata_file_compair and not agent:
        compair = csv_metadata_file_compair
    elif agent:
        compair = agent
    else:
        click.secho('One of the -a or -d parameters is mandatory', fg='red')
        sys.exit(1)
    click.secho('CSV diff: {first} <-> {second}'.format(
        first=compair, second=csv_metadata_file),
                fg='green')
    if output:
        file_name = os.path.splitext(csv_metadata_file)[0]
        file_name_new = '{name}_new.json'.format(name=file_name)
        file_new = open(file_name_new, 'w')
        file_new.write('[')
        file_name_diff = '{name}_changed.json'.format(name=file_name)
        file_diff = open(file_name_diff, 'w')
        file_diff.write('[')
        file_name_delete = '{name}_delete.json'.format(name=file_name)
        file_delete = open(file_name_delete, 'w')
        file_delete.write('[')
        click.echo('New     file: {name}'.format(name=file_name_new))
        click.echo('Changed file: {name}'.format(name=file_name_diff))
        click.echo('Deleted file: {name}'.format(name=file_name_delete))
        # Per-file record counters so JSON commas are placed correctly; the
        # original reused the input line index, which could emit a leading
        # comma before the first record of a file.
        new_count = diff_count = 0

    compaire_data = SqliteDict(sqlite_dict, autocommit=True)
    if csv_metadata_file_compair and not agent:
        length = number_records_in_file(csv_metadata_file_compair, 'csv')
        with open(csv_metadata_file_compair, 'r', buffering=1) as meta_file:
            label = 'Loading: {name}'.format(name=compair)
            with click.progressbar(meta_file, length=length,
                                   label=label) as metadata:
                for metadata_line in metadata:
                    pid, data = get_pid_data(metadata_line)
                    compaire_data[pid] = data
    elif agent:
        agent_class = get_agent_class(agent)
        length = agent_class.count()
        ids = agent_class.get_all_ids()
        with click.progressbar(ids, length=length) as record_ids:
            for id in record_ids:
                record = agent_class.get_record_by_id(id)
                pid = record.pid
                compaire_data[pid] = record

    with open(csv_metadata_file, 'r', buffering=1) as metadata_file:
        for metadata_line in metadata_file:
            pid, data = get_pid_data(metadata_line)
            if pid in compaire_data:
                if compaire_data[pid] != data:
                    click.echo('DIFF: ')
                    click.echo(' old:\t{data}'.format(
                        data=json.dumps(compaire_data[pid], sort_keys=True)))
                    click.echo(' new:\t{data}'.format(
                        data=json.dumps(data, sort_keys=True)))
                    if output:
                        if diff_count > 0:
                            file_diff.write(',')
                        file_diff.write(intent_output(data, indent))
                        diff_count += 1
                del compaire_data[pid]
            else:
                click.echo('NEW :\t{data}'.format(
                    data=json.dumps(data, sort_keys=True)))
                if output:
                    if new_count > 0:
                        file_new.write(',')
                    file_new.write(intent_output(data, indent))
                    new_count += 1
    idx = 0
    for pid, data in compaire_data.items():
        click.echo(
            'DEL :\t{data}'.format(data=json.dumps(data, sort_keys=True)))
        if output:
            if idx > 0:
                file_delete.write(',')
            file_delete.write(intent_output(data, indent))
            idx += 1

    if output:
        file_new.write('\n]')
        file_new.close()
        file_diff.write('\n]')
        file_diff.close()
        file_delete.write('\n]')
        file_delete.close()
    sys.exit(0)
Example #26
class SQLiteRepository(Repository):
    repository_name: str = 'sqlite'
    extension: str = '.db'

    def __init__(self, repository_path: str, commit_on_close: bool = True,
                 verbosity: int = 0):
        super().__init__(repository_path, commit_on_close=commit_on_close,
                         verbosity=verbosity)
        self.sqlite_repository = None
        self.table_name = None

    @contextmanager
    def connect(self, table_name: str) -> 'SQLiteRepository':
        try:
            yield self.open(table_name)
        finally:
            # Close even when the with-body raises; the original skipped
            # close() on exceptions.
            self.close()

    def open(self, table_name: str):
        self.sqlite_repository = SqliteDict(
            self.repository_path, tablename=table_name, encode=json.dumps,
            decode=json.loads, flag='c')
        self.table_name = table_name
        return self

    def close(self):
        if self.sqlite_repository is not None:
            if self.commit_on_close:
                self.commit()
            self.sqlite_repository.close()
        self.sqlite_repository = None
        self.table_name = None

    def commit(self):
        self.sqlite_repository.commit()

    def keys(self) -> List[str]:
        return list(self.sqlite_repository.keys())

    def update(self, key: str, update_obj: dict):
        self.sqlite_repository[key] = update_obj

    def upsert(self, key: str, obj: dict):
        self.sqlite_repository[key] = obj

    def get(self, key: str) -> dict:
        try:
            return self.sqlite_repository[key]
        except KeyError:
            raise InvalidEntryError(key)

    def get_multiple(self, keys: List[str]) -> Dict[str, dict]:
        values = {key: element for key, element in self.sqlite_repository.items() if key in keys}
        if len(set(keys)) != len(values):
            invalids = set(keys).difference(values.keys())
            raise InvalidEntryError(', '.join(list(invalids)))
        return values

    def get_all(self) -> Dict[str, dict]:
        return {key: element for key, element in self.sqlite_repository.items()}

    def remove(self, key: str):
        try:
            del self.sqlite_repository[key]
        except KeyError:
            raise InvalidEntryError(key)

    def remove_multiple(self, keys: List[str]):
        for key in keys:
            self.remove(key)

    def clear(self):
        self.sqlite_repository.clear()

    def __repr__(self):
        return f"SQLiteRepository(file='{self.repository_path}', open={self.sqlite_repository is not None}, " \
               f"table='{self.table_name}')"
Example #27
    # tokens = set(tokens)
    # The enclosing `if` was redundant; the while condition already covers it.
    while '' in tokens:
        tokens.remove('')

    tokens = list(tokens)
    return tokens


claimLst = []
evidenceLst = []
targetLabelLst = []
ctr_breaker = 0

for claimId, val in training_db.items():
    supportsOrRefutes = val[1]

    if ctr_breaker % 500 == 0:
        print(ctr_breaker)

    if ctr_breaker == 150000:
        break

    ctr_breaker += 1

    bufferEvidenceCombination = []

    if supportsOrRefutes != 'NOT ENOUGH INFO':
        claim = val[2]
        claimTokens = tokenise_line(claim)
Example #28
def csv_diff(csv_metadata_file, csv_metadata_file_compair, entity, output,
             indent, verbose, sqlite_dict):
    """Entities record diff.

    :param csv_metadata_file: CSV metadata file to compair.
    :param csv_metadata_file_compair: CSV metadata file to compair too.
    :param entity: entity type to compair too.
    :param verbose: Verbose.
    :param sqlite_dict: SqliteDict Db file name.
    """
    def get_pid_data(line):
        """Get JSON from CSV text line.

        :param line: line of CSV text.
        :returns: data as json
        """
        data = json.loads(line.split('\t')[3].replace('\\\\', '\\'))
        pid = data.get('pid')
        return pid, data

    def ordert_data(data):
        """Order data by pid.

        :param data: line of CSV text.
        :returns: pid and data as json
        """
        data = json.loads(data.split('\t')[3])
        pid = data.get('pid')
        return pid, data

    if csv_metadata_file_compair and not entity:
        compair = csv_metadata_file_compair
    elif entity:
        compair = entity
    else:
        click.secho('One of the -a or -d parameters is mandatory', fg='red')
        sys.exit(1)
    click.secho(f'CSV diff: {compair} <-> {csv_metadata_file}', fg='green')
    if output:
        file_name = os.path.splitext(csv_metadata_file)[0]
        file_name_new = f'{file_name}_new.json'
        file_new = JsonWriter(file_name_new)
        file_name_diff = f'{file_name}_changed.json'
        file_diff = JsonWriter(file_name_diff)
        file_name_delete = f'{file_name}_delete.json'
        file_delete = JsonWriter(file_name_delete)
        click.echo(f'New     file: {file_name_new}')
        click.echo(f'Changed file: {file_name_diff}')
        click.echo(f'Deleted file: {file_name_delete}')

    compaire_data = SqliteDict(sqlite_dict, autocommit=True)
    if csv_metadata_file_compair and not entity:
        length = number_records_in_file(csv_metadata_file_compair, 'csv')
        with open(csv_metadata_file_compair, 'r', buffering=1) as meta_file:
            label = f'Loading: {compair}'
            with click.progressbar(meta_file, length=length,
                                   label=label) as metadata:
                for metadata_line in metadata:
                    pid, data = get_pid_data(metadata_line)
                    compaire_data[pid] = data
    elif entity:
        entity_class = get_entity_class(entity)
        length = entity_class.count()
        ids = entity_class.get_all_ids()
        with click.progressbar(ids, length=length) as record_ids:
            for id in record_ids:
                record = entity_class.get_record_by_id(id)
                pid = record.pid
                compaire_data[pid] = record

    db.session.close()
    with open(csv_metadata_file, 'r', buffering=1) as metadata_file:
        for metadata_line in metadata_file:
            pid, data = get_pid_data(metadata_line)
            if pid in compaire_data:
                if compaire_data[pid] != data:
                    if verbose:
                        click.echo('DIFF: ')
                        click.echo(
                            ' old:\t'
                            f'{json.dumps(compaire_data[pid], sort_keys=True)}'
                        )
                        click.echo(
                            f' new:\t{json.dumps(data, sort_keys=True)}')
                    if output:
                        file_diff.write(data)
                del compaire_data[pid]
            else:
                if verbose:
                    click.echo(f'NEW :\t{json.dumps(data, sort_keys=True)}')
                if output:
                    file_new.write(data)

    for pid, data in compaire_data.items():
        if verbose:
            click.echo(f'DEL :\t{json.dumps(data, sort_keys=True)}')
        if output:
            file_delete.write(data)
    if output:
        # The writers only exist when output was requested; closing them
        # unconditionally raised NameError otherwise.
        file_new.close()
        file_diff.close()
        file_delete.close()
    sys.exit(0)
Example #29
from sqlitedict import SqliteDict

source = "/../../../../scratch/manoelribeiro/helpers/text_dict.sqlite"
path = './../data/sqlite/split_texts/'

print("Start")
value_dict = SqliteDict(source, tablename="text", flag="r")
print("value_dict")
new_value_dict = SqliteDict(f"{path}text_dict_{0}.sqlite",
                            tablename="value",
                            journal_mode='OFF')

c = 0
for key, value in value_dict.items():
    c += 1
    if c % 10000000 == 0:
        print(c)
        new_value_dict.commit()
        new_value_dict.close()

        new_value_dict = SqliteDict(f'{path}text_dict_{c}.sqlite',
                                    tablename="value",
                                    journal_mode='OFF')

    new_value_dict[key] = value

value_dict.close()
new_value_dict.commit()
new_value_dict.close()
Example #30
class MyForm(QMainWindow):
    def __init__(self):
        super().__init__()
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)

        self.ui.button_load_data.clicked.connect(self.load)
        self.ui.edit_database_path.editingFinished.connect(self.load_from_lineEdit)
        self.ui.button_save.clicked.connect(self.save)
        self.ui.button_add.clicked.connect(self.add_entry)
        self.ui.button_plan.clicked.connect(self.gen_week_table)

    def closeEvent(self, event):
        close = QMessageBox()
        close.setText("Speichern ?")
        close.setStandardButtons(QMessageBox.Yes | QMessageBox.Cancel)
        close = close.exec()
        if close == QMessageBox.Yes:
            self.save()
            event.accept()
        else:
            # Cancel presumably means "abort closing" rather than "discard
            # and close", which is what the original event.accept() did.
            event.ignore()

    def load(self):
        # Load SQLdict; the original had this try/except commented out,
        # leaving the handler body dead.
        try:
            self.fname = QFileDialog.getOpenFileName(self, 'Open file', '', 'Database *.sqlite')  # get file path
            self.ui.edit_database_path.setText('Loaded database: ' + self.fname[0])  # set path in the LineEdit
            self.Kochbuch = SqliteDict(self.fname[0], autocommit=True)  # load main dictionary of dishes
            self.create_content_table()
        except Exception:
            print('Cannot load specified file!\nError in main.load()')

    def load_from_lineEdit(self):
        # Load from LineEdit
        try:
            # getOpenFileName returns an immutable tuple, so item assignment
            # (self.fname[0] = ...) raised TypeError; rebind instead.
            path = self.ui.edit_database_path.text()[17:]  # strip the 'Loaded database: ' prefix
            self.fname = (path, '')
            self.Kochbuch = SqliteDict(path, autocommit=True)  # load main dictionary of dishes
            self.create_content_table()
        except Exception:
            print('Cannot load specified file!\nError in main.load_from_lineEdit()')

    def save(self):
        # get items from content_table, update Kochbuch and commit it to database
        self.Kochbuch.clear()
        table = self.ui.content_table
        header_items = [table.model().headerData(i, Qt.Horizontal) for i in range(table.model().columnCount())]

        for row_index in range(self.ui.content_table.rowCount()): # Every row is one dish/gericht
            temp_dict = dict()
            for col_index, item in enumerate(header_items):
                temp_dict[item] = table.cellWidget(row_index,col_index).text()
            self.add_gericht(temp_dict)

    def add_entry(self):
        # Add empty entry to table
        row_cnt = self.ui.content_table.rowCount()
        col_cnt = self.ui.content_table.columnCount()
        self.ui.content_table.insertRow(row_cnt)

        for col_index in range(col_cnt):
            self.ui.content_table.setCellWidget(row_cnt, col_index, QLineEdit())
            if col_index == col_cnt - 1:  # Delete Option
                self.ui.content_table.setCellWidget(row_cnt, col_index, QPushButton('Delete'))
                self.ui.content_table.cellWidget(row_cnt, col_index).clicked.connect(self.remove_entry)

    def remove_entry(self):
        table = self.ui.content_table
        # --------------Remove Row------------
        column = table.currentColumn()
        row = table.currentRow()
        table.removeRow(row)
        # -------------Remove dict entry--------
        #name = table.cellWidget(row,0).text()
        #self.del_gericht(name)

    def create_content_table(self):
        # Creates the Widgets inside the Table
        table = self.ui.content_table
        table.setRowCount(len(self.Kochbuch))
        header_items = [table.model().headerData(i, Qt.Horizontal) for i in range(table.model().columnCount())]
        row_label = []
        col_cnt = table.model().columnCount()

        for row_index, val in enumerate(self.Kochbuch.items()):
            #row_label.append(str(row_index + 1) + ' ' + str(val[0]))
            for col_index in range(col_cnt):
                table.setCellWidget(row_index,col_index,QLineEdit())
                if col_index == col_cnt - 1: # Add Delete Button
                    table.setCellWidget(row_index, col_index, QPushButton('Delete'))
                    table.cellWidget(row_index,col_index).clicked.connect(self.remove_entry)
        #self.ui.content_table.setVerticalHeaderLabels(row_label)
        self.set_text_to_table(header_items)

    def set_text_to_table(self,header_items):
        table = self.ui.content_table
        for row_index, val in enumerate(self.Kochbuch.items()):
            table.cellWidget(row_index, 0).setText(val[0]) # Name column/ set Name
            #print(val[1].values())
            for col_index, item in enumerate(header_items[1:]):
                try:
                    table.cellWidget(row_index, col_index + 1).setText(val[1][item])
                except KeyError:
                    if item is None:  # ensures the delete-button text is not overwritten
                        pass
                    else:
                        # Set unfilled category empty
                        table.cellWidget(row_index, col_index + 1).setText('')

    def add_gericht(self, entries: dict):
        # Old func args: name: str, Fisch: bool, Nudeln: bool, Vortag: bool, SE: bool, WE: bool, WE_Wichtung: float, Wichtung: float

        # Dishes are stored in a dict().
        # Each dish is categorised by:
        #       - Fisch: bool (fish)
        #       - Nudeln: bool (pasta)
        #       - Vortag: bool (if a lot is ordered on Sundays, cook the previous day's meal)
        #       - SE: bool (Sunday meal)
        #       - WE: bool (weekend meal, e.g. pick-up/delivery)
        #       - WE_Wichtung: float (every dish should get a chance on weekends; pick-up/delivery or e.g. Rouladen are preferred)
        #       - Wichtung: float (probability of the dish being chosen, to avoid repeats) 1.0 = will be chosen; 0 = will not be chosen
        #       etc.
        # -------------------------------------------------------------------------------
        # Tortillas = dict() # a single dish dict()
        # Tortillas['Fisch'] = False
        # Tortillas['Nudeln'] = False
        # Tortillas['Vortag'] = False
        # Tortillas['SE'] = False
        # Tortillas['WE'] = False
        # Tortillas['WE_Wichtung'] = 0.1
        # Tortillas['Wichtung'] = 1.0

        # Gerichte['Tortillas'] = Tortillas
        # -------------------------------------------------------------------------------
        '''name_dict = dict()
        name_dict['Fisch'] = Fisch
        name_dict['Nudeln'] = Nudeln
        name_dict['Vortag'] = Vortag
        name_dict['SE'] = SE
        name_dict['WE'] = WE
        name_dict['WE_Wichtung'] = WE_Wichtung
        name_dict['Wichtung'] = Wichtung'''
        name = entries['Name']
        self.Kochbuch[name] = entries

    def del_gericht(self,name):
        self.Kochbuch.pop(name)

    def update_kochbuch(self, Kochbuch: dict, name: str, kategorie: str, value: any):
        # Because this is a dict nested inside a dict whose outer layer is an
        # SqliteDict, entries must be updated read-modify-write style so the
        # change actually goes from RAM into the SQL file.
        update = Kochbuch[name]
        update[kategorie] = value
        Kochbuch[name] = update

    def choose(self, dishes):
        choosed_dish = np.asarray(dishes)
        Wichtungen = choosed_dish[:, 1].astype(float)  # np.float was removed in NumPy 1.24
        #choosed_dish = np.sort(choosed_dish)

        # Find maxima in the Wichtung column
        max_indizes = np.where(Wichtungen == np.amax(Wichtungen))
        finds = []
        for i in max_indizes[0]:
            finds.append(choosed_dish[i])
        # If several dishes tie for the maximum, pick one at random; either
        # way return a single [name, weight] entry (the original returned a
        # list in the one-element case, which broke callers doing choose(...)[0]).
        if len(finds) > 1:
            dish_index = random.randint(0, len(finds) - 1)
            return finds[dish_index]
        else:
            return finds[0]
        #print('test','\n',choosed_dish)


    def gen_week_table(self):
        # generate table of dishes day wise
        dishes = [i for i in self.Kochbuch.items()]
        dishes_cnt = len(dishes)

        usable_dishes = []
        possible_dishes_mon = []
        possible_dishes_tue = []
        possible_dishes_wed = []
        possible_dishes_thu = []
        possible_dishes_fri = []
        possible_dishes_sat = []
        possible_dishes_sun = []
        saison = 'Winter'

        for index, dish in enumerate(dishes):
            dish = dish[1]
            # Standard check to narrow the dishes down by season and weight
            #if float(dish['Wichtung']) > 0.7 and  dish['Saison'] == saison:  # Standard check
            if float(dish['Wichtung']) > 0.7 and (dish['Saison'] == saison or dish['Saison'] == 'None'):
                usable_dishes.append(dish)

                # -----------Monday-------------
                # ------------------------------
                # Mondays should prefer pasta ---> bonus when Nudeln == True
                if dish['Fisch'] == 'False':
                    if dish['Nudeln'] == 'True':
                        possible_dishes_mon.append([dish['Name'], float(dish['Wichtung']) + 0.3])
                    else:
                        possible_dishes_mon.append([dish['Name'], float(dish['Wichtung'])])

               #-----------------------------------------------------------------------------

                # -----------Tuesday/Wednesday/Thursday-------------
                # --------------------------------------------------
                # Days without particular preferences
                if dish['Fisch'] == 'False' and dish['Vortag'] == 'False': # Standard check
                    possible_dishes_tue.append([dish['Name'], float(dish['Wichtung'])])
                    possible_dishes_wed.append([dish['Name'], float(dish['Wichtung'])])
                    possible_dishes_thu.append([dish['Name'], float(dish['Wichtung'])])

                # -----------------------------------------------------------------------------

                # -----------Friday-------------
                # ------------------------------
                # Fish preferred
                if dish['WE'] == 'True' and dish['SE'] == 'False':
                    if dish['Fisch'] == 'True':
                        possible_dishes_fri.append([dish['Name'], float(dish['Wichtung']) + 0.3])
                    else:
                        possible_dishes_fri.append([dish['Name'], float(dish['Wichtung'])])

                # -----------------------------------------------------------------------------

                # -----------Saturday-------------
                # --------------------------------
                # WE category preferred
                if dish['Fisch'] == 'False' and dish['SE'] == 'False' and dish['Vortag'] == 'False':
                    if dish['WE'] == 'True':
                        possible_dishes_sat.append([dish['Name'], float(dish['Wichtung']) + 0.3])
                    else:
                        possible_dishes_sat.append([dish['Name'], float(dish['Wichtung'])])

                # -----------------------------------------------------------------------------

                # -----------Sunday-------------
                # ------------------------------
                # SE highly preferred
                if dish['Fisch'] == 'False' and dish['Vortag'] == 'False':
                    if dish['SE'] == 'True':
                        possible_dishes_sun.append([dish['Name'], float(dish['Wichtung']) + 0.5])
                    else:
                        possible_dishes_sun.append([dish['Name'], float(dish['Wichtung'])])

        print('============================================================================')
        print('=================================Wochenplan=================================')
        print('============================================================================')
        print('Monday:  ', self.choose(possible_dishes_mon)[0])
        print('----------------------------------------------------------------------------')
        print('Tuesday: ', self.choose(possible_dishes_tue)[0])
        print('----------------------------------------------------------------------------')
        print('Wednesday: ', self.choose(possible_dishes_wed)[0])
        print('----------------------------------------------------------------------------')
        print('Thursday: ', self.choose(possible_dishes_thu)[0])
        print('----------------------------------------------------------------------------')
        print('Friday: ', self.choose(possible_dishes_fri)[0])
        print('----------------------------------------------------------------------------')
        print('Saturday: ', self.choose(possible_dishes_sat)[0])
        print('----------------------------------------------------------------------------')
        print('Sunday: ', self.choose(possible_dishes_sun)[0])
        print('============================================================================')


        #print(self.choose(possible_dishes_mon)[0])
        Speiseplan = collections.OrderedDict()