def extract_instances(self, train_instances):
        """ extract features to features """
        self.extract_information(train_instances)

        features = []
        infos = []
        process_bar = pyprind.ProgPercent(len(train_instances))

        ''' get features from train instances'''

        alignment_feature_file = self.feature_file.replace('IdfAlignmentFeature', 'AlignmentFeature')
        alignment_features = utils.create_read_file(alignment_feature_file).readlines()

        idf_weight = self.idf_weight
        default_idf_weight = min(idf_weight.values())

        for train_instance, alignment_feature in zip(train_instances, alignment_features[1:]):
            process_bar.update()

            alignment_feature = alignment_feature.split('\t#\t')[1]
            myWordAlignments = json.loads(alignment_feature)[0]  # list of [sa_idx, sb_idx] index start from 1

            word_sa, word_sb = train_instance.get_word(type='lemma', lower=True)

            sa_aligned = [sa_idx - 1 for sa_idx, sb_idx in myWordAlignments]
            sb_aligned = [sb_idx - 1 for sa_idx, sb_idx in myWordAlignments]

            sent1_aligned = [0] * len(word_sa)
            sent2_aligned = [0] * len(word_sb)

            for sa_index in sa_aligned:
                sent1_aligned[sa_index] = 1

            for sb_index in sb_aligned:
                sent2_aligned[sb_index] = 1

            # calc all and aligned except stopwords
            sent1_sum = 0
            sent2_sum = 0
            sent1_ali = 0
            sent2_ali = 0
            for idx, word in enumerate(word_sa):
                weight = idf_weight.get(word, default_idf_weight)
                sent1_ali += sent1_aligned[idx] * weight
                sent1_sum += weight

            for idx, word in enumerate(word_sb):
                weight = idf_weight.get(word, default_idf_weight)
                sent2_ali += sent2_aligned[idx] * weight
                sent2_sum += weight
            feature = [1.0 * (sent1_ali + sent2_ali) / (sent1_sum + sent2_sum + 1e-6)]
            info = [sent1_ali, sent2_ali, sent1_sum, sent2_sum]
            features.append(feature)
            infos.append(info)

        return features, infos
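A minimal self-contained sketch of the IDF-weighted alignment proportion computed above, on made-up inputs (the toy idf_weight, sentences, and 1-based alignment pairs are illustrative assumptions, not values from the feature class):

# Hedged sketch: aligned IDF mass over total IDF mass, on toy data.
idf_weight = {'cat': 2.0, 'sat': 1.0, 'mat': 2.0, 'the': 0.1, 'on': 0.1}
default_idf_weight = min(idf_weight.values())

word_sa = ['the', 'cat', 'sat']
word_sb = ['the', 'cat', 'sat', 'on', 'the', 'mat']
alignments = [[1, 1], [2, 2], [3, 3]]      # 1-based [sa_idx, sb_idx] pairs

sent1_aligned = [0] * len(word_sa)
sent2_aligned = [0] * len(word_sb)
for sa_idx, sb_idx in alignments:
    sent1_aligned[sa_idx - 1] = 1
    sent2_aligned[sb_idx - 1] = 1

ali = sum(f * idf_weight.get(w, default_idf_weight) for w, f in zip(word_sa, sent1_aligned)) \
    + sum(f * idf_weight.get(w, default_idf_weight) for w, f in zip(word_sb, sent2_aligned))
total = sum(idf_weight.get(w, default_idf_weight) for w in word_sa + word_sb)
print(ali / (total + 1e-6))                # ~0.73 for this toy pair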
Example #2
def get_random(in_file, p, to_file):

    process_bar = pyprind.ProgPercent(12440969)
    with open(in_file) as fin, \
         open(to_file, "w") as fout:

        for line in fin:
            process_bar.update()
            if random.random() < p:
                fout.write(line)
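A hedged usage note for get_random (module-level imports of random and pyprind are assumed, and the file names below are illustrative): with p=0.01 roughly 1% of the input lines are kept.

# Hedged usage sketch; paths are illustrative and about p of the lines are kept.
get_random('big_corpus.txt', 0.01, 'sample_1pct.txt')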
def load_word_embedding_std(in_vocab, emb_file, n_dim=300):
    """
    load word embedding in a standard way:
    Args:
        in_vocab:
        emb_file:
        n_dim:
    Returns:
        word2index, embeddings
    """
    word2index = {}
    embeddings = []
    word2index[pad_word] = 0
    embeddings.append(np.zeros(n_dim))
    word2index[unk_word] = 1
    embeddings.append(np.random.uniform(-0.25, 0.25, (n_dim, )))
    word_index = 2
    print('Load word embedding: %s' % emb_file)

    total_line = get_embedding_file_len(emb_file)
    process_bar = pyprind.ProgPercent(total_line)

    with open(emb_file, 'r', errors='ignore') as f:
        # with open(emb_file, 'r') as f:
        for idx, line in enumerate(f):
            process_bar.update()
            # if the first line is (vocab, ndim)
            if idx == 0 and len(line.split()) == 2:
                continue
            # split the line
            sp = line.rstrip().split()
            if len(sp) != n_dim + 1:
                print(sp[0:len(sp) - n_dim])

            w = ''.join(sp[0:len(sp) - n_dim])
            emb = [float(x) for x in sp[len(sp) - n_dim:]]
            assert len(emb) == n_dim
            if w in in_vocab and w not in word2index:
                word2index[w] = word_index
                embeddings.append(emb)
                word_index += 1

    pre_trained_len = len(word2index)
    n_words = len(in_vocab)

    print('Pre-trained: {}/{} {:.2f}'.format(
        pre_trained_len, n_words, pre_trained_len * 100.0 / n_words))

    oov_word_list = [w for w in in_vocab if w not in word2index]
    print('oov word list example (30): ', oov_word_list[:30])
    pickle.dump(oov_word_list, open('./oov.p', 'wb'))

    embeddings = np.array(embeddings, dtype=np.float32)
    return word2index, embeddings
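A hedged usage sketch for the loader above; the embedding path is an illustrative assumption and in_vocab is any collection of word types from the training data.

# Hedged usage sketch; the GloVe path below is illustrative.
vocab = {'the', 'cat', 'sat'}
word2index, embeddings = load_word_embedding_std(vocab, '/path/to/glove.840B.300d.txt', n_dim=300)
print(embeddings.shape)    # (len(word2index), 300); index 0 = pad (zeros), index 1 = unk (random)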
def main(topology):
    # The input to main is the path to the topology file.
    # This script saves two JSON files inside the downloaded-tweets directory:
    # one with all the active users and the other with all the inactive users from the topology.
    # User activity is based on status count and availability of tweets (public vs private).
    #
    # The script can be stopped and restarted in the middle of a run without losing progress.

    inactive_users = read_json('dnld_tweets/inactive_users.json')
    active_users = read_json('dnld_tweets/active_users.json')
    twpy_api = auth.get_access_creds()
    tweets_dir = './dnld_tweets/'

    # put every single user (non repeating) from the topology file into a set
    with open(topology, 'r') as inp_file:
        comm_set = set(user for community in inp_file for user in ast.literal_eval(community))

    # create directory for storing tweets
    if not os.path.exists(os.path.dirname(tweets_dir)):
        os.makedirs(os.path.dirname(tweets_dir), 0o755)

    # download tweets for every single user in the set
    # separate active users from inactive users based on status count and availability
    bar = pyprind.ProgPercent(len(comm_set), track_time=True, title='Downloading Tweets') 
    while comm_set:
        user = comm_set.pop()
        bar.update(item_id=str(user) + '\t')

        if str(user) in inactive_users or str(user) in active_users:
            continue

        # skip user if they don't exist or are inactive
        status_count = user_status_count(user, twpy_api)
        if status_count <= 10:
            inactive_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        # skip user if already downloaded their tweets
        if os.path.exists(os.path.join(tweets_dir, str(user))):
            active_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        tweets = get_tweets(user, twpy_api)

        if tweets:
            tweet_filename = tweets_dir + str(user)
            write_tweets(tweets, tweet_filename)
            active_users[str(user)] = status_count
        else:
            inactive_users[str(user)] = 0 

        write_json(tweets_dir, active_users, inactive_users)
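The stop-and-restart behavior relies on the read_json and write_json helpers, which are not shown in this example; a minimal sketch of what they could look like (an assumption, not the original implementation):

# Hedged sketch of the assumed helpers: read_json tolerates a missing file so a fresh
# run starts empty, and write_json persists progress after every processed user.
import json
import os

def read_json(path):
    if not os.path.exists(path):
        return {}
    with open(path) as f:
        return json.load(f)

def write_json(tweets_dir, active_users, inactive_users):
    with open(os.path.join(tweets_dir, 'active_users.json'), 'w') as f:
        json.dump(active_users, f)
    with open(os.path.join(tweets_dir, 'inactive_users.json'), 'w') as f:
        json.dump(inactive_users, f)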
def load_word_embedding_my(word2index, emb_file, n_dim=300):
    """
    load word embedding in a tricky way:
    obtain the embedding equal to the len(word2index)
    Args:
        word2index:
        emb_file:
        n_dim:

    Returns:
        word2index, embeddings
    """

    print('Load word embedding: %s' % emb_file)

    pre_trained = {}
    n_words = len(word2index)

    embeddings = np.random.uniform(-0.25, 0.25, (n_words, n_dim))
    embeddings[0, ] = np.zeros(n_dim)

    total_line = get_embedding_file_len(emb_file)
    process_bar = pyprind.ProgPercent(total_line)

    # with open(emb_file, 'r', errors='ignore') as f:
    with open(emb_file, 'r') as f:
        for idx, line in enumerate(f):
            process_bar.update()
            # if the first line is (vocab, ndim)
            if idx == 0 and len(line.split()) == 2:
                continue
            # split the line
            sp = line.rstrip().split()
            if len(sp) != n_dim + 1:
                print(sp[0:len(sp) - n_dim])

            w = ''.join(sp[0:len(sp) - n_dim])
            emb = [float(x) for x in sp[len(sp) - n_dim:]]

            if w in word2index and w not in pre_trained:
                embeddings[word2index[w]] = emb
                pre_trained[w] = 1

    pre_trained_len = len(pre_trained)

    print('Pre-trained: {}/{} {:.2f}'.format(
        pre_trained_len, n_words, pre_trained_len * 100.0 / n_words))

    oov_word_list = [w for w in word2index if w not in pre_trained]
    print('oov word list example (30): ', oov_word_list[:30])
    pickle.dump(oov_word_list, open('./oov.p', 'wb'))

    embeddings = np.array(embeddings, dtype=np.float32)
    return word2index, embeddings
def load_parse_data(train_file, parser=None, flag=False):
    """
    Load data after Parse, like POS, NER, etc.
    Value: [ SentPair:class, ... ]
    Parameter:
        flag: False(Default), Load from file (resources....)
              True, Parse and Write to file, and then load from file
    """
    ''' Pre-Define Write File '''

    # parse_train_file = config.PARSE_DIR + '/' + \
    #                    utils.FileManager.get_file(train_file)

    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):

        print(train_file)
        if parser is None:
            raise RuntimeError(
                "parser should be init by ``nlp = stst.StanfordNLP('http://localhost:9000')``"
            )
        ''' Parse Data '''
        data = load_STS(train_file)

        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" %
              (train_file, len(data)))

        parse_data = []
        process_bar = pyprind.ProgPercent(len(data))
        for (sa, sb, score) in data:
            process_bar.update()
            parse_sa = parser.parse(sa)
            parse_sb = parser.parse(sb)
            parse_data.append((parse_sa, parse_sb, score))
        ''' Write Data to File '''
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                line = json.dumps(parse_instance)
                print(line, file=f_parse)
    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            parse_json = json.loads(line)
            sentpair_instance = SentPair(parse_json)
            parse_data.append(sentpair_instance)

    print("Load Data, train_file=%s, n_train=%d\n" %
          (train_file, len(parse_data)))
    return parse_data
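A hedged usage sketch following the error message above (the data path is illustrative): parse once with flag=True, after which the cached file under ./generate/parse can be reloaded with flag=False.

# Hedged usage sketch; './data/sts-train.tsv' is an illustrative path.
nlp = stst.StanfordNLP('http://localhost:9000')
train_instances = load_parse_data('./data/sts-train.tsv', parser=nlp, flag=True)
# Later runs can skip parsing and read the cached ./generate/parse/sts-train.tsv:
train_instances = load_parse_data('./data/sts-train.tsv', flag=False)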
def example_2():
    n = 1000000
    my_per = pyprind.ProgPercent(n,
                                 stream=1,
                                 track_time=True,
                                 title='My Percent Indicator',
                                 monitor=True)
    for i in range(n):
        # do some computation
        my_per.update()
    print('\n\nPrint tracking object ...\n')
    print(my_per)
Example #8
def train(model, train_data_loader, optimizer_model, optimizer_step, criterion,
          use_cuda, epoch):

    totalloss_meter = AverageMeter()
    psnr_meter = AverageMeter()
    # hloss_meter = AverageMeter()

    # switch to train mode
    model.train()
    # progress bar init
    bar = pyprind.ProgPercent(len(train_data_loader), update_interval=1)
    tic = time.time()
    for batch, (batch_data) in enumerate(train_data_loader):

        batch_data = batch_data.cuda()

        # Compute gradient and do optimizer step
        optimizer_model.zero_grad()
        # Compute output
        predictions, encoded = model(batch_data)

        # hloss = criterion(batch_data, encoded)

        # Calculate loss
        total_loss = criterion(batch_data, predictions)  # + hloss
        #, L1E, KLD, CCE, ACC1, ACC2

        total_loss.backward()

        optimizer_model.step()
        if optimizer_step is not None: optimizer_step.step()

        toc = time.time() - tic
        tic = time.time()

        psnr_ = psnr_metric(batch_data, predictions)

        # Metrics
        totalloss_meter.update(total_loss.data.cpu().item(),
                               batch_data.shape[0])
        psnr_meter.update(psnr_.data.cpu().item(), batch_data.shape[0])
        # hloss_meter.update(hloss.data.cpu().item(), batch_data.shape[0])
        # Update log progress bar
        log_ = ' loss:' + '{0:4.4f}'.format(totalloss_meter.avg)
        log_ += ' psnr:' + '{0:4.4f}'.format(psnr_meter.avg)
        # log_ += ' hloss:'+ '{0:4.4f}'.format(hloss_meter.avg)
        log_ += ' batch time:' + '{0:2.3f}'.format(toc)
        bar.update(item_id=log_)

        del total_loss, batch_data, predictions

    return totalloss_meter.avg, psnr_meter.avg
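psnr_metric is not defined in this example; a minimal sketch for image tensors scaled to [0, 1] (the peak value of 1.0 is an assumption) could be:

# Hedged sketch of a PSNR metric for tensors in [0, 1]; peak=1.0 is assumed.
import torch

def psnr_metric(target, prediction, peak=1.0):
    mse = torch.mean((target - prediction) ** 2)
    return 10.0 * torch.log10(peak ** 2 / (mse + 1e-12))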
def parse_data(data):
    """parse data
    Returns:
        outputs: list of (parse_sa, parse_sb, score)
    """
    outputs = []
    process_bar = pyprind.ProgPercent(len(data), title='parse the data')
    for example in data:
        process_bar.update()
        sa, sb, score = example
        parse_sa, parse_sb = nlp.parse(sa), nlp.parse(sb)
        outputs.append((parse_sa, parse_sb, score))
    return outputs
def tagging_train_data(in_file, to_file, statistics_file):

    dict_P_to_count = {}

    NUM_NEGATIVE = 100000

    process_bar = pyprind.ProgPercent(1175201)
    with open(in_file) as fin, \
         open(to_file, "w") as fout:

        for line in fin:
            process_bar.update()

            line_list = line.strip().split("\t")
            dict_label_info_string = line_list[-1]
            dict_label_info = ujson.loads(dict_label_info_string)

            # load the positive and negative samples for each predicate P:
            for P in dict_label_info:

                if P not in dict_P_to_count:
                    dict_P_to_count[P] = {}
                    dict_P_to_count[P]["positive"] = 0
                    dict_P_to_count[P]["negative"] = 0

                for s_o in dict_label_info[P]["candidates"]:
                    label = dict_label_info[P]["candidates"][s_o]["label"]

                    # positive example
                    if label > 0:
                        dict_P_to_count[P]["positive"] += 1
                        dict_label_info[P]["candidates"][s_o]["wanted"] = 1
                    else:
                        # negative example
                        if label < 0 and dict_P_to_count[P]["negative"] < NUM_NEGATIVE:
                            dict_P_to_count[P]["negative"] += 1

                            dict_label_info[P]["candidates"][s_o]["wanted"] = 1
                        else:
                            dict_label_info[P]["candidates"][s_o]["wanted"] = 0

            out_line_list = line_list[:-1] + [
                ujson.dumps(dict_label_info, ensure_ascii=False)
            ]
            fout.write("\t".join(out_line_list) + "\n")

    with open(statistics_file, "w") as fout:
        for P in dict_P_to_count:
            fout.write("%s\t%d\t%d\n" % (P, dict_P_to_count[P]["positive"],
                                         dict_P_to_count[P]["negative"]))
Example #11
def upload_to_s3(s3_path, file_name=None, file_text=None, acl='private'):
    """Uploads a file to S3

    Args:
        s3_path(S3Path): Output path of the file to be uploaded
        file_name(str): Name of the file to be uploaded to s3
        file_text(str): Contents of the file to be uploaded
        acl(str): ACL policy of the file on S3
    """
    if not isinstance(s3_path, S3Path):
        raise ETLInputError('Input path should be of type S3Path')

    if not any([file_name, file_text]):
        raise ETLInputError('File_name or text should be given')

    if file_name:
        source_size = os.stat(file_name).st_size
    else:
        source_size = len(file_text)
    bar = None
    cb = None

    bucket = get_s3_bucket(s3_path.bucket)
    if s3_path.is_directory:
        key_name = os.path.join(s3_path.key, os.path.basename(file_name))
    else:
        key_name = s3_path.key

    key = bucket.new_key(key_name)
    if file_name:
        if source_size > LARGE_FILE_LIMIT:
            _multipart_upload(bucket, key_name, file_name)
        else:
            if source_size > CHUNK_SIZE:
                bar = pyprind.ProgPercent(PROGRESS_SECTIONS,
                                          monitor=True,
                                          title='Uploading %s' % file_name)

                def _callback(current, total):
                    bar.update()

                cb = _callback
            key.set_contents_from_filename(file_name,
                                           cb=cb,
                                           num_cb=PROGRESS_SECTIONS,
                                           policy=acl)
    else:
        key.set_contents_from_string(file_text,
                                     cb=cb,
                                     num_cb=PROGRESS_SECTIONS,
                                     policy=acl)
def aligner(parsed_data):
    """aligner
        sim(sa, sb) = \frac{sa_{aligned} + sb_{aligned}}{sa_{all} + sb_{all}}
    """
    preds = []
    golds = []
    process_bar = pyprind.ProgPercent(len(parsed_data))
    for example in parsed_data:
        process_bar.update()
        parse_sa, parse_sb, score = example
        features, infos = align_feats(parse_sa, parse_sb)
        preds.append(features[0])
        golds.append(score)
    return preds, golds
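A toy check of the docstring formula: with 3 of 5 tokens aligned in sa and 3 of 4 aligned in sb, sim = (3 + 3) / (5 + 4) ≈ 0.67 (align_feats itself is not shown in this example).

# Toy check of the alignment-proportion formula; the numbers are made up.
sa_aligned, sa_all = 3, 5
sb_aligned, sb_all = 3, 4
print((sa_aligned + sb_aligned) / (sa_all + sb_all))   # 0.666...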
def load_embed_from_text(emb_file, n_dim):
    """
    load_embedding from raw text
    Args:
        emb_file:
        n_dim:

    Returns:
        word2index: dict
        embeddings: numpy
    """
    print('Load word embedding: %s' % emb_file)

    word2index = {}
    embeddings = []

    word2index[pad_word] = 0
    embeddings.append(np.zeros(n_dim))
    word2index[unk_word] = 1
    embeddings.append(np.random.uniform(-0.25, 0.25, (n_dim, )))
    word_index = 2

    total_line = get_embedding_file_len(emb_file)
    process_bar = pyprind.ProgPercent(total_line)

    with open(emb_file, 'r') as f:
        for idx, line in enumerate(f):
            process_bar.update()
            # if the first line is (vocab, ndim)
            if idx == 0 and len(line.split()) == 2:
                print('embedding info: ', line)
                continue

            # split the line
            sp = line.rstrip().split()
            if len(sp) != n_dim + 1:
                print(sp[0:len(sp) - n_dim])

            w = ''.join(sp[0:len(sp) - n_dim])
            emb = [float(x) for x in sp[len(sp) - n_dim:]]

            word2index[w] = word_index
            embeddings.append(emb)
            word_index += 1

    embeddings = np.array(embeddings, dtype=np.float32)
    print('finished load input embed!!')

    return word2index, embeddings
Example #14
def get_dict_author_keyword_freq(dict_author_keywords_path, to_file):
    dict_author_keywords_freq = {}
    dict_author_keywords = json.load(open(dict_author_keywords_path),
                                     encoding="utf-8")
    bar = pyprind.ProgPercent(len(dict_author_keywords))
    for autherId in dict_author_keywords:
        if autherId not in dict_author_keywords_freq:
            dict_author_keywords_freq[autherId] = Counter()
        keywords = dict_author_keywords[autherId]
        for keyword in keywords:
            dict_author_keywords_freq[autherId][keyword] += 1
        bar.update()

    print "dump..."
    json.dump(dict_author_keywords_freq, open(to_file, "w"), encoding="utf-8")
    def extract_instances(self, train_instances):
        """ extract features to features """

        # first extract information from train_instance
        # for only be used to extract data_set information and can reuse the pyprind
        self.extract_information(train_instances)
        features = []
        infos = []
        process_bar = pyprind.ProgPercent(len(train_instances))
        for train_instance in train_instances:
            process_bar.update()
            feature, info = self.extract(train_instance)  # variable arguments are passed through here
            features.append(feature)
            infos.append(info)
        return features, infos
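The loop above calls a per-instance extract() hook; a hedged sketch of a subclass that fills it in (the Feature base-class name and everything except the get_word call, which appears elsewhere in these examples, are illustrative assumptions):

# Hedged sketch of a feature class fitting the loop above; names other than
# extract/extract_information/get_word are illustrative assumptions.
class SentenceLengthFeature(Feature):
    def extract_information(self, train_instances):
        pass  # no corpus-level statistics needed for this toy feature

    def extract(self, train_instance):
        word_sa, word_sb = train_instance.get_word(type='lemma', lower=True)
        ratio = min(len(word_sa), len(word_sb)) / (max(len(word_sa), len(word_sb)) + 1e-6)
        feature = [ratio]
        info = [len(word_sa), len(word_sb)]
        return feature, info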
Example #16
def load_parse_data(train_file, nlp=None, flag=False):
    """
    Load data after Parse, like POS, NER, etc.
    Value: [ SentPair:class, ... ]
    Parameter:
        flag: False(Default), Load from file (resources....)
              True, Parse and Write to file, and then load from file
    """
    ''' Pre-Define Write File '''

    # parse_train_file = config.PARSE_DIR + '/' + \
    #                    utils.FileManager.get_file(train_file)

    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):

        print(train_file)
        ''' Parse Data '''
        data = load_data(train_file)

        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" %
              (train_file, len(data)))

        parse_data = []
        process_bar = pyprind.ProgPercent(len(data))
        for (sent, label) in data:
            process_bar.update()
            sent = preprocess(sent)
            parse_data.append((sent, label))
        ''' Write Data to File '''
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                line = json.dumps(parse_instance, ensure_ascii=False)
                print(line, file=f_parse)
    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            sent, label = json.loads(line)
            sentpair_instance = Sent(sent, label)
            parse_data.append(sentpair_instance)

    print("Load Data, train_file=%s, n_train=%d\n" %
          (train_file, len(parse_data)))
    return parse_data
Example #17
def train(model, train_data_loader, optimizer_model, optimizer_step, criterion,
          use_cuda, epoch):

    totalloss_meter = AverageMeter()
    psnr_meter = AverageMeter()

    # switch to train mode
    model.train()
    # progress bar init
    bar = pyprind.ProgPercent(len(train_data_loader), update_interval=1)
    tic = time.time()
    for batch, (input_image) in enumerate(train_data_loader):

        input_image = input_image.cuda()

        # Compute gradient and do optimizer step
        optimizer_model.zero_grad()
        # Compute output
        recon_image, encode_master, encode_student = model(input_image)

        # Calculate loss
        total_loss = 0.01 * criterion(input_image, recon_image) +\
         criterion(encode_master, encode_student)#, target_var
        #, L1E, KLD, CCE, ACC1, ACC2

        total_loss.backward()

        optimizer_model.step()
        if optimizer_step is not None: optimizer_step.step()

        toc = time.time() - tic
        tic = time.time()

        psnr_ = psnr_metric(input_image, recon_image)

        # Metrics
        totalloss_meter.update(total_loss.data.cpu().item(),
                               input_image.shape[0])
        psnr_meter.update(psnr_.data.cpu().item(), input_image.shape[0])
        # Update log progress bar
        log_ = ' loss:' + '{0:4.4f}'.format(totalloss_meter.avg)
        log_ += ' psnr:' + '{0:4.4f}'.format(psnr_meter.avg)
        log_ += ' batch time:' + '{0:2.3f}'.format(toc)
        bar.update(item_id=log_)

        del total_loss, input_image, recon_image, encode_master, encode_student

    return totalloss_meter.avg, psnr_meter.avg
def main(topology):
    inactive_users = read_json('dnld_tweets/inactive_users.json')
    active_users = read_json('dnld_tweets/active_users.json')
    _, app_auths = auth.get_access_creds()
    tweets_dir = './dnld_tweets/'

    with open(topology, 'r') as inp_file:
        comm_set = set(user for community in inp_file
                       for user in ast.literal_eval(community))

    if not os.path.exists(os.path.dirname(tweets_dir)):
        os.makedirs(os.path.dirname(tweets_dir), 0o755)

    bar = pyprind.ProgPercent(len(comm_set),
                              track_time=True,
                              title='Downloading Tweets')
    while comm_set:
        user = comm_set.pop()
        bar.update(item_id=user)

        if str(user) in inactive_users:
            continue

        api = auth.manage_auth_handlers(app_auths)

        # skip user if they don't exist or are inactive
        status_count = user_status_count(user, api)
        if status_count <= 10:
            inactive_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        # skip user if you've already downloaded their tweets
        if os.path.exists(os.path.join(tweets_dir, str(user))):
            active_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        tweets = get_tweets(user, api)

        if tweets:
            tweet_filename = tweets_dir + str(user)
            write_tweets(tweets, tweet_filename)
            active_users[str(user)] = status_count
        else:
            inactive_users[str(user)] = 0

        write_json(tweets_dir, active_users, inactive_users)
def get_user_followers(twpy_api, user_ids):
    # returns the set of all user ids (originals plus their followers) and {user: [follower_ids]}
    followers = list(user_ids)  # copy so extending it does not grow the list being iterated
    user_followers = {}
    bar = pyprind.ProgPercent(len(user_ids),
                              track_time=True,
                              title='Finding user followers')
    for user in user_ids:
        bar.update(item_id=str(user) + '\t')
        try:  # protected tweets or user doesn't exist
            user_followers[user] = twpy_api.followers_ids(id=user)
            followers.extend(user_followers[user])
        except:
            print("Skipping user: " + str(user))

    return set(followers), user_followers
Example #20
def get_top_k_coauthors(paper_author_path, k, to_file):

    data = util.read_dict_from_csv(paper_author_path)

    dict_paperId_to_authors = {}
    bar = pyprind.ProgPercent(len(data))
    for item in data:
        paperId = int(item["PaperId"])
        authorId = int(item["AuthorId"])
        if paperId not in dict_paperId_to_authors:
            dict_paperId_to_authors[paperId] = []
        dict_paperId_to_authors[paperId].append(authorId)
        bar.update()

    print "dump..."
    json.dump(dict_paperId_to_authors, open(to_file, "w"), encoding="utf-8")
    def make_frames(z):
        files = []
        tmpdir = tempfile.mkdtemp()

        if verbose:
            print('Saving sequence ' + filename + ' in ' + vext + ' format')
            pbar = progressbar.ProgPercent(N_frame, monitor=True)
        for frame in range(N_frame):
            if verbose: pbar.update()
            fname = 'frame%06d.png' % frame
            full_fname = os.path.join(tmpdir, fname)
            image = np.rot90(z[..., frame])
            imageio.imsave(full_fname, (image*255).astype(np.uint8), compression=0, quantize=256)
            files.append(fname)

        if verbose: print(pbar)
        return tmpdir, files
Example #22
def aligning_documents_by_interlanguage_links(source_corpus_file,
                                              target_corpus_file,
                                              source_language, target_language,
                                              output_path):

    if not output_path.endswith('/'): output_path = output_path + '/'
    check_dir(output_path)  # if directory does not exist, then create

    logging.info(
        'aligning %s and %s wikipedia documents using interlanguage links',
        source_language, target_language)
    source_docs = split_wikipedia_docs_into_array(source_corpus_file)
    logging.info('source corpus is loaded')
    target_docs = split_wikipedia_docs_into_array(target_corpus_file)
    logging.info('target corpus is loaded')

    target_titles = [
        get_title_from_interlanguage_links(d, source_language)
        for d in target_docs
    ]

    logging.info('start aligning...')
    source_out = open(output_path + source_language + '-wiki.txt', 'w', encoding='utf-8')
    target_out = open(output_path + target_language + '-wiki.txt', 'w', encoding='utf-8')
    count = 0

    my_prperc = pyprind.ProgPercent(len(source_docs))

    for i in range(len(source_docs)):
        my_prperc.update()  # print progress
        source_title = get_title_from_interlanguage_links(
            source_docs[i], source_language)
        try:
            index = target_titles.index(source_title)
            text_out = source_docs[i]
            print(text_out, file=source_out)
            text_out = target_docs[index]
            print(text_out, file=target_out)
            count += 1
        except:
            continue

    logging.info(
        'aligning by document interlanguage links is done! ... \n %d documents are aligned',
        count)
Example #23
def img_to_jpeg(img_path):
    """
    Convert every image under img_path to IMAGES_EXTENSION format so it is compatible with TF pre-trained NNs
    :param img_path: path to image folder
    :return: None. It writes images on disk
    """
    img_found_counter = 0
    img_converted_counter = 0
    if not os.path.isdir(img_path):
        raise InputError('{} is not a valid path'.format(img_path))
    for (gen_path, bmp_paths, img_names) in os.walk(img_path):
        bar = pyprind.ProgPercent(len(img_names))
        # print(gen_path, bmp_paths, img_names)
        for file_name in img_names:
            if not file_name.endswith(IMAGES_EXTENSION):
                file_no_extension = os.path.splitext(file_name)[0]
                # file_no_extension = file_name.replace('.bmp', '')
                img_found_counter += 1
                # if (file_no_extension + IMAGES_EXTENSION) not in img_names:
                if True:
                    logger.info('Now processing: {}'.format(file_name))
                    file_path = os.path.join(gen_path, file_name)
                    if not os.path.isfile(file_path):
                        raise InputError(
                            '{} is not a valid image'.format(file_path))
                    with Image.open(file_path) as img:
                        img = img.convert('L')
                        img = uglify_image(img)
                        # path is valid as it has been checked before
                        img.save(os.path.join(
                            gen_path, file_no_extension + IMAGES_EXTENSION),
                                 IMAGES_EXTENSION.replace('.', ''),
                                 dpi=(DPI_EXTRACTION, DPI_EXTRACTION))
                        img_converted_counter += 1
                        logger.info('{} successfully written on disk!'.format(
                            file_name))
            bar.update()
    if img_found_counter == 0:
        logger.warning('No img to convert found!')
    else:
        if img_converted_counter == 0:
            logger.info('No img to convert left!')
Example #24
def get_data(server, dataUrls, category, dataset, dbconn):
    data = []
    category_keys = [str(a) for a in category.keys()]
    percentage = pyprind.ProgPercent(len(dataUrls))  #Progress bar
    for dataUrl in dataUrls:
        response = server.get(dataUrl).json()
        if 'dataValues' in response.keys():
            values = response['dataValues']
            for value in values:
                categoryOptionCombo = value['categoryOptionCombo'].lower()
                if 'value' in value and categoryOptionCombo in category_keys:
                    data.append([
                        value['orgUnit'], value['period'],
                        value['dataElement'], category[categoryOptionCombo],
                        value['value']
                    ])
        #Update progress bar
        percentage.update()
    return data
Example #25
def prepare_author_keywords(aid_pid_path, paper_path):
    aid_pid = util.read_dict_from_csv(aid_pid_path)
    paper = util.read_dict_from_csv(paper_path)
    print 'finish loading csv file'
    max_aid, max_pid = 0, 0
    for item in aid_pid:
        max_aid = max(max_aid, int(item["AuthorId"]))
        max_pid = max(max_pid, int(item['PaperId']))
    for item in paper:
        max_pid = max(max_pid, int(item["Id"]))
    print 'max_aid', max_aid, 'max_pid', max_pid

    paper_indexd_list = range(max_pid + 1)
    for item in paper:
        paper_indexd_list[int(item["Id"])] = item
    del paper

    keywords = [[] for _ in range(max_aid + 1)]  # independent list per author; [[]] * n would alias one shared list
    bar = pyprind.ProgPercent(len(aid_pid))
    for item in aid_pid:
        bar.update()
        aid = int(item['AuthorId'])
        pid = int(item['PaperId'])
        if pid != paper_indexd_list[pid]:
            kw = paper_indexd_list[pid]["Keyword"]
            if kw:
                keywords[aid].append(kw)

    author_keywords = []
    for index, item in enumerate(keywords):
        dic = {}
        if item:
            dic["AuthorId"] = str(index)
            dic["Keywords"] = " ".join(item)
            author_keywords.append(dic)

    del keywords
    write_dict_to_csv(['AuthorId', 'Keywords'], author_keywords,
                      config.AUTHOR_KEYWORDS_FILE)
    print 'finish writing author_keywords csv in', config.AUTHOR_KEYWORDS_FILE
Example #26
def evalModels(models, test_loader, testing_mode=False, return_y=False):
    test_correct = {key: 0.0 for key in models}
    if return_y:
        y_pred = {key: torch.Tensor([]).long() for key in models}
        y_true = torch.Tensor([]).long()
    bar = pyprind.ProgPercent(len(test_loader.dataset),
                              title="Testing epoch : ")
    for model in models.values():
        model.train(testing_mode)
    with torch.no_grad():
        for idx, data in enumerate(test_loader):
            x, y = data
            inputs = x.to(device)
            labels = y.to(device)

            if return_y:
                y_true = torch.cat((y_true, y.long().view(-1)))

            for key, model in models.items():
                outputs = model(inputs)

                test_correct[key] += (torch.max(
                    outputs, 1)[1] == labels.long().view(-1)).sum().item()

                if return_y:
                    y_pred[key] = torch.cat(
                        (y_pred[key],
                         torch.max(outputs,
                                   1)[1].to(torch.device('cpu')).long()))

            bar.update(test_loader.batch_size)
            #clear_output(wait=True)
            #print('Testing batch : {:.3f} %'.format(
            #    ((idx+1)*test_loader.batch_size*100) / len(test_loader.dataset)
            #))
    if return_y:
        return test_correct, y_true, y_pred
    else:
        return test_correct
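A hedged usage sketch converting the returned correct-counts into accuracies (the models dict and test_loader are assumed from the surrounding training setup).

# Hedged usage sketch; 'models' and 'test_loader' are assumed from the training setup.
correct = evalModels(models, test_loader, testing_mode=False)
accuracy = {name: 100.0 * n / len(test_loader.dataset) for name, n in correct.items()}
print(accuracy)   # per-model test accuracy in percent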
Example #27
def get_1w_positive_5w_negative(in_file, to_file):

    fout = open(to_file, "w")

    count = {}
    process_bar = pyprind.ProgPercent(3500000)
    for line in open(in_file):
        process_bar.update()

        wanted = False
        label_info = json.loads(line.split("\t")[-1])

        for P in label_info:

            if P not in count:
                count[P] = {}
                count[P]["positive"] = 0
                count[P]["negative"] = 0
                count[P]["NULL"] = 0

            for so in label_info[P]["candidates"]:

                label = label_info[P]["candidates"][so]["label"]

                if label > 0 and count[P]["positive"] < 10000:
                    wanted = True
                    count[P]["positive"] += 1

                if label < 0 and count[P]["negative"] < 50000:
                    wanted = True
                    count[P]["negative"] += 1

                if label == 0:
                    count[P]["NULL"] += 1

        if wanted:
            fout.write(line)

    fout.close()
Example #28
def get_dict_auther_keywords(paper_path, paper_author_path, k, to_file):

    data_paper = util.read_dict_from_csv(paper_path)
    dict_paper_author = json.load(open(paper_author_path), encoding="utf-8")
    #print(dict_paper_author["1048576"])
    dict_auther_keywords = {}
    print("start...")
    bar = pyprind.ProgPercent(len(data_paper))
    for item in data_paper:
        paperId = int(item["Id"])
        title = item["Title"]
        keywords = item["Keyword"]
        key = util.get_string_splited(title + " " + keywords)

        for authorId in dict_paper_author[str(paperId)]:
            if authorId not in dict_auther_keywords:
                dict_auther_keywords[authorId] = []
            dict_auther_keywords[authorId].extend(key)
        bar.update()

    print "dump..."
    json.dump(dict_auther_keywords, open(to_file, "w"), encoding="utf-8")
Example #29
    def export(self, filename=None, **kwargs):
        if not filename:
            raise Exception('The "filename" parameter is required to export.')

        filename = os.path.expanduser(filename)

        from solvebio import Dataset

        result_count = len(self.query)
        if result_count <= 0:
            raise AttributeError('No results found in query!')

        self.rows = []
        self.key_map = OrderedDict()
        self.key_types = {}

        for f in Dataset.retrieve(self.query._dataset_id).fields(limit=1000):
            name = f['name']
            splits = [int(s) if s.isdigit() else s for s in name.split('.')]
            self.key_map[name] = splits
            self.key_types[name] = f['data_type']

        title = 'Exporting query to: {0}'.format(filename)

        if self.show_progress:
            progress_bar = pyprind.ProgPercent(result_count,
                                               title=title,
                                               track_time=False)
        else:
            print(title)

        for ind, record in enumerate(self.query):
            row = self.process_record(record)
            self.rows.append(row)
            if self.show_progress:
                progress_bar.update()

        self.write(filename=filename)
        print('Export complete!')
Example #30
def encode_word_dictionary(input_files, output_file):
    word_file_name, alphabet_file_name = input_files

    with open(alphabet_file_name) as alphabet_file:
        alphabet = json.loads(alphabet_file.read())

    with open(word_file_name) as word_file:
        words = [word.rstrip() for word in word_file]

    progress_bar = pyprind.ProgPercent(len(words))

    encoded_words = []
    for word in words:
        encoded_word = [alphabet[c] for c in word]
        encoded_word = [alphabet['START']] + encoded_word + [alphabet['END']]

        encoded_words.append(encoded_word)

        progress_bar.update()

    with open(output_file, 'w') as f:
        f.write(json.dumps(encoded_words))
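The alphabet file above is assumed to map single characters plus START and END markers to integer ids; a hedged sketch of building such a file from the word list:

# Hedged sketch of how the assumed alphabet JSON could be produced.
import json

def build_alphabet(word_file_name, alphabet_file_name):
    with open(word_file_name) as f:
        chars = sorted({c for word in f for c in word.rstrip()})
    alphabet = {c: i for i, c in enumerate(chars)}
    alphabet['START'] = len(alphabet)
    alphabet['END'] = len(alphabet)
    with open(alphabet_file_name, 'w') as f:
        json.dump(alphabet, f)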