Example #1
def evaluate_single_pred(url, url2, decoder_length=8):
    cr = Crawling()
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    lt = data.shape[0] * data.shape[1]
    data = np.reshape(data, (lt, 25))
    dtl = len(data)
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    loss_mae = 0.0
    loss_rmse = 0.0
    r2_total = 0.0
    for i, d in enumerate(data):
        pred_t = np.asarray(d).flatten()
        lb_i = i * pr.strides + 24
        lbt = labels[lb_i:(lb_i + decoder_length), :, 0]
        lbg = lbt[decoder_length - 1, :].flatten()
        mae, mse, r2 = get_evaluation(pred_t, lbg)
        loss_mae += mae
        loss_rmse += mse
        r2_total += r2
        utils.update_progress((i + 1.0) / dtl)
    loss_mae = loss_mae / lt * 300
    loss_rmse = sqrt(loss_rmse / lt) * 300
    r2_total = r2_total / lt
    print("MAE: %.6f %.6f" % (loss_mae, cr.ConcPM25(loss_mae)))
    print("RMSE: %.6f %.6f" % (loss_rmse, cr.ConcPM25(loss_rmse)))
    print("R2 score: %.6f" % r2_total)
Example #2
def main(limit=50):  # original default: limit=len(ALL)
    d = [a for a in (ALL.items())[:limit]]
    total = float(limit)
    out = []
    i = 0
    record_count = 0
    print 'Exporting patrons...'
    ft = open(export_dir+'PATRONS.marc.txt', 'w')
    fb = open(export_dir+'PATRONS.marc.dat', 'wb')
    for (recid, record) in d:
        i = i + 1
        if 'EXPIR' in record and record['EXPIR']=='None':
            expiration_date = datetime.strptime('9999-1-1', '%Y-%m-%d')
        elif 'EXPIR' in record:
            try:
                expiration_date = datetime.strptime(format_date(record['EXPIR'], '%Y-%m-%d %H:%M:%S'), '%Y-%m-%d')
            except:
                expiration_date = datetime.strptime(format_date(record['EXPIR'], '%m/%d/%Y'), '%Y-%m-%d')
        if not 'EXPIR' in record or ('EXPIR' in record and expiration_date > datetime.now()):
            if 'PA' not in record or ('PA' in record and len(record['PA'])!=3):
                rec_binary = format_record(recid, record)
                fb.write(rec_binary.as_marc())
                ft.write(str(rec_binary) + '\n==================\n')
                record_count = record_count + 1
            if i > limit:
                break
        update_progress(i*100/total)
    fb.close()
    ft.close()
    print "\nPatrons exported: %d/%d" % (record_count, limit)
Example #3
def main(limit=len(ALL)):
    keep_history = False
    if len(sys.argv) > 1:
        keep_history = sys.argv[1] == "keep_history"
    d = dict([a for a in ALL.items()[:limit]])
    total = float(limit)
    out = []
    i = 0
    record_count = 0
    # erase the log file
    with codecs.open(log_dir + "log_checkouts.txt", "w", encoding="utf8") as f:
        f.close()
    print "Exporting checkouts..."
    ft = open(export_dir + "CHECKOUTS.txt", "w")
    # fb = open(export_dir+'CHECKOUTS.marc.dat', 'wb')
    for (recid, record) in d.items():
        i = i + 1
        active_loan = not ("IDATE" in record)
        if keep_history or active_loan:
            rec_binary = format_record(recid)
            # fb.write(rec_binary.as_marc())
            ft.write(str(rec_binary))
            record_count = record_count + 1
        if i > limit:
            break
        update_progress(i * 100 / total)
    # fb.close()
    ft.close()
    print "\nCheckouts exported: %d/%d %s" % (
        record_count,
        limit,
        "(the rest are old loans)" if not keep_history else "",
    )
Example #4
def create_manifest(data_path, tag, ordered=True):
    manifest_path = '%s_manifest.csv' % tag
    file_paths = []
    wav_files = [
        os.path.join(dirpath, f)
        for dirpath, dirnames, files in os.walk(data_path)
        for f in fnmatch.filter(files, '*.wav')
    ]
    size = len(wav_files)
    counter = 0
    for file_path in wav_files:
        file_paths.append(file_path.strip())
        counter += 1
        update_progress(counter / float(size))
    print('\n')
    if ordered:
        _order_files(file_paths)
    counter = 0
    with io.FileIO(manifest_path, "w") as file:
        for wav_path in file_paths:
            ### Modified from utils.py to remove a replace step and add "_16k"...
            transcript_path = wav_path.replace('_16k.wav', '.txt')
            sample = os.path.abspath(wav_path) + ',' + os.path.abspath(
                transcript_path) + '\n'
            file.write(sample.encode('utf-8'))
            counter += 1
            update_progress(counter / float(size))
    print('\n')
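A possible invocation of create_manifest (the path and tag below are made-up; each *_16k.wav file is assumed to sit next to a matching .txt transcript):
# Hypothetical paths; adjust to the actual dataset layout.
create_manifest('/data/ted/converted', 'ted_train', ordered=True)
# -> writes ted_train_manifest.csv with one "wav_path,transcript_path" row per file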
Example #5
def get_articles(folder, sitemap):
    last_index = get_last_index('crawling/%s' % folder)
    total = len(sitemap)
    for index, a in enumerate(sitemap):
        key = hash(a['link'])
        if not key in loaded:
            loaded[key] = 1
            article = get_article_name(index + last_index)
            base = 'crawling/%s/%s' % (folder, article)
            try: 
                r = requests.get(a['link'], timeout=p.url_timeout)
                # r = urllib2.urlopen(a['link'])
                html = Soup(r.text)
                title = html.find('h1')
                if title:
                    title = getText(title)
                else:
                    title = ''
                content = content_extractor.analyze(r.content)
                if len(content.split(' ')) >= p.min_length:
                    # print([])
                    content = title.encode('utf-8') + '\n' + a['link'].encode('utf-8') + '\n' + content
                    utils.save_file(base + '.txt', content, False)
                    #get images
                get_images(base, a['images'])
            except requests.exceptions.Timeout:
                utils.save_file('cached.pkl', loaded)
                print("Timeout url: %s" % a['link'])
            except Exception as e:
                utils.save_file('cached.pkl', loaded)
                print("Error occured", e)
        utils.update_progress((index + 1) * 1.0 / total)
Example #6
def upload_processed_files_to_s3(_context, local_directory_path, pruned_filekey):
    directory_key = config.S3_UPLOAD_PATH + "/" + pruned_filekey + "/"

    s3 = get_boto_client()

    # Create directory
    s3.put_object(
        ACL='public-read',
        Bucket=config.AWS_S3_BUCKET,
        Key=directory_key
    )

    files = glob.glob(local_directory_path+"/*")
    for _idx, _file in enumerate(files):
        filename = _file.split("/")[-1]
        file_key = directory_key + filename
        resp = s3.put_object(
            ACL='public-read',
            Bucket=config.AWS_S3_BUCKET,
            Key=file_key,
            Body=open(_file).read()
        )
        progress_step_offset = 33 + 25 + 12
        progress_step_weight = (100 - (33 + 25 + 12))/100.0
        percent_complete = (_idx * 1.0 / len(files)) * 100
        update_progress(
            _context,
            (progress_step_offset + (progress_step_weight * percent_complete))
            )
Example #7
    def __index_corpus(self):
        num_tokens = 0
        num_sentences = 0
        vocabs = {}
        sentence_offset = []
        with open(self.path, 'r') as f:
            progress = 0
            sentence_offset.append(f.tell())
            line = f.readline()
            while line:
                num_sentences += 1
                if not (num_sentences % self.block_size):
                    sentence_offset.append(f.tell())
                if not (num_sentences % 10000):
                    progress = f.tell() * 1.0 / self.size
                    utils.update_progress(progress, "Counting vocabs", 40)

                tokens = line.strip().split()
                num_tokens += len(tokens)
                list(map(lambda x: utils.inc_dict_value(vocabs, x), tokens))

                line = f.readline()

            if progress < 1: utils.update_progress(1, "Counting vocabs", 40)
        return vocabs, sentence_offset, num_sentences
Example #8
def main(limit=len(ALL)):
    d = dict([a for a in ALL.items()[:limit]])
    total = float(limit)
    out = []
    i = 0
    record_count = 0
    with codecs.open(log_dir + 'log_serials.txt', 'w', encoding='utf8') as f:
        f.close()
    with codecs.open(log_dir + 'unmatched_serials.txt', 'w', encoding='utf8') as f:
        f.close()
    print 'Exporting serials...'
    ft = open(export_dir+'SERIALS.marc.txt', 'w')
    fb = open(export_dir+'SERIALS.marc.dat', 'wb')
    for (recid, record) in d.items():
        i = i + 1
    # if 'ACTIV' in record:
        rec_binary = format_record(recid)
        fb.write(rec_binary.as_marc())
        ft.write(str(rec_binary) + '\n==================\n')
        record_count = record_count + 1
        if i > limit:
            break
        update_progress(i*100/total)
    fb.close()
    ft.close()
    print "\nSerials exported: %d/%d" % (record_count, limit)
Example #9
def convert_midi_files_to_json(_context, filelist, pruned_filekey):
    converted_files = []
    for _idx, _file in enumerate(filelist):
        f = open(_file, "rb")  # MIDI data is binary; read as bytes
        data = f.read()
        f.close()
        encoded_data = base64.b64encode(data)
        _d = {}
        _d['dataUri'] = "data:audio/midi;base64,"+encoded_data
        _d['_idx'] = _idx
        _d['key'] = pruned_filekey
        new_filepath = _file.replace(".midi", ".json")
        f = open(new_filepath, "w")
        f.write(json.dumps(_d))
        f.close()
        os.remove(_file)
        if not new_filepath.endswith("submission.json"):
            converted_files.append(
                new_filepath.replace(
                    config.TEMP_STORAGE_DIRECTORY_PATH, "")
                    )

        progress_step_offset = 33 + 25
        progress_step_weight = 0.12
        percent_complete = (_idx * 1.0 / len(filelist)) * 100
        update_progress(
            _context,
            (progress_step_offset + (progress_step_weight * percent_complete))
            )

    """
    These filekeys are relative to the `S3_UPLOAD_PATH` in the bucket
    and the first item of the returned array is the main submission file.
    """
    return [pruned_filekey+'/submission.json'] + converted_files
Example #10
def main():
    global d
    if not d:
        d = get_items('BARCD')
    total = float(len(d))
    i = 1
    fb = open(export_dir+'ITEMS.marc.dat', 'wb')
    ft = open(export_dir+'ITEMS.marc.txt', 'w')
    print 'Exporting items...'
    item_count = 0
    for (recid, copies) in d.items():
        if not is_staff_paper(recid):
            record = Record()
            id_field = Field(tag='999', indicators=[' ', ' '], subfields=['a', recid, 'b', ALL[recid].get('ID', '')])
            record.add_ordered_field(id_field)
            for c in copies.items():
                aux = [(e[0], items_fix[e[0]](e[1])) for e in c[1].items() if e[0] in items_fix]
                item_field = Field(tag='945', indicators=[' ', ' '], subfields= ['b', c[0]]+flatten_list(aux))
                record.add_ordered_field(item_field)
                item_count = item_count + 1
            fb.write(record.as_marc())
            ft.write(str(record) + '\n==================\n')
        update_progress(i*100/total)
        i = i + 1
    print "\nRecords:\t" + str(int(total))
    print "Items:  \t" + str(item_count)
    fb.close()
    ft.close()
Example #11
def prepare_dir(ted_dir):
    converted_dir = os.path.join(ted_dir, "converted")
    # directories to store converted wav files and their transcriptions
    wav_dir = os.path.join(converted_dir, "wav")
    if not os.path.exists(wav_dir):
        os.makedirs(wav_dir)
    txt_dir = os.path.join(converted_dir, "txt")
    if not os.path.exists(txt_dir):
        os.makedirs(txt_dir)
    counter = 0
    entries = os.listdir(os.path.join(ted_dir, "sph"))
    for sph_file in entries:
        speaker_name = sph_file.split('.sph')[0]

        sph_file_full = os.path.join(ted_dir, "sph", sph_file)
        stm_file_full = os.path.join(ted_dir, "stm", "{}.stm".format(speaker_name))

        assert os.path.exists(sph_file_full) and os.path.exists(stm_file_full)
        all_utterances = get_utterances_from_stm(stm_file_full)

        all_utterances = filter(filter_short_utterances, all_utterances)
        for utterance_id, utterance in enumerate(all_utterances):
            target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(utterance["filename"], str(utterance_id)))
            target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(utterance["filename"], str(utterance_id)))
            cut_utterance(sph_file_full, target_wav_file, utterance["start_time"], utterance["end_time"],
                          sample_rate=args.sample_rate)
            with io.FileIO(target_txt_file, "w") as f:
                f.write(_preprocess_transcript(utterance["transcript"]).encode('utf-8'))
        counter += 1
        update_progress(counter / float(len(entries)))
Example #12
def compute_fleiss_kappa(num_categories, model_names, device, dataloader):
    num_subjects = len(dataloader.dataset)
    fleiss_input = np.zeros((num_subjects, num_categories))

    for model_name in model_names:
        model = torch.load(model_name)
        model = model.to(device)
        model.eval()
        start = time.time()

        for i, (inputs, _) in enumerate(dataloader):
            inputs = inputs.to(device)
            with torch.no_grad():
                outputs = model(inputs)
                for j, output in enumerate(outputs):
                    fleiss_input[i * len(outputs) + j][output] += 1

                if i % 100 == 0:
                    update_progress(i / len(dataloader.dataset))

        print(f'Time Elapsed: {time.time() - start:.0f}s')

    return fleiss_kappa(fleiss_input)
Example #13
    def _evaluate(self):
        evalYHat = np.zeros([self.gen.nb_samples, self.gen.nb_classes])
        Y = np.zeros([self.gen.nb_samples, self.gen.nb_classes])
        iterGen = self.gen.begin()
        s_idx = 0

        if self.model is not None:
            for i in range(self.gen.nb_batches):
                utils.update_progress(i / self.gen.nb_batches)
                batch, y = next(iterGen)
                f_idx = s_idx + y.shape[-2]

                y_hat = self.model.predict_on_batch(x=batch)
                evalYHat[s_idx:f_idx, :] = y_hat
                Y[s_idx:f_idx, :] = y
                s_idx = f_idx

        else:
            evalYHat = self.yhat
            Y = self.y

        utils.update_progress(self.gen.nb_batches)
        print()
        accs, self.mP, self.mR, self.F1 = computeMultiLabelLoss(Y, evalYHat)
        self.mAP = computemAPLoss(Y, evalYHat)
        self.tp = accs[:, 1]
        self.fp = accs[:, 2]
        self.fn = accs[:, 3]
        self.precision = accs[:, 4]
        self.recall = accs[:, 5]
        self.nb_zeros = np.count_nonzero(accs[:, 1] == 0)
        self.Y_hat = evalYHat
        self.Y = Y
Example #14
 def update_list(self, url_list):
     total_items = len(url_list)
     if total_items:
         start_progress_dialog(True)
         #######################################################################################
         # TODO: 02 CANCEL AND RETURN IF PROGRESS BAR CANCEL BUTTON IS PRESSED
         #######################################################################################
         utils.update_progress(preset.message["loading_bookmarks"], -1,
                               total_items)
         for index, url in enumerate(url_list):
             self.update_element(index, url)
             url_object = preset.Header()
             url_object.set_data(url)
             self.url_objects.append(url_object)
             #######################################################################################
             # TODO: 03 LET USER CHANGE COLOR IN POPUP MENU https://wiki.wxpython.org/PopupMenuOnRightClick
             # TODO: 03 CHANGE COLOR IN POPUP MENU http://revxatlarge.blogspot.com/2011/06/wxpython-listbox-popupmenu.html
             # TODO: 03 CHANGE COLOR IN POPUP MENU https://www.daniweb.com/programming/software-development/threads/352474/wxpython-wx-listctrl-and-wx-menu
             #######################################################################################
             if index % 2:
                 self.list_ctrl.SetItemBackgroundColour(index, "#FFFFFF")
             else:
                 self.list_ctrl.SetItemBackgroundColour(index, "#EEEEEE")
             utils.update_progress(preset.message["loading_bookmarks"],
                                   index, total_items)
         self.update_column_width()
         start_progress_dialog(False)
     else:
         set_status_message(self.parent,
                            preset.message["user_has_no_bookmarks"])
Example #15
def evaluate_sp(url, url2, decoder_length=24, is_grid=True, grid_eval=True):
    cr = Crawling()
    map_ = heatmap.build_map()
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    if len(data.shape) == 4:
        lt = data.shape[0] * data.shape[1]
    else:
        lt = data.shape[0]
    if is_grid:
        data = np.reshape(data, (lt, data.shape[-2], 25, 25))
    else:
        data = np.reshape(data, (lt, data.shape[-2], 25))
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    loss_mae = 0.0
    loss_rmse = 0.0
    r2_total = 0.0
    for i, d in enumerate(data):
        d = d[:decoder_length, :, :]
        pred_t = []
        if is_grid:
            for d_ in d:
                d_t = heatmap.clear_interpolate_bound(np.asarray(d_), map_)
                pred_t.append(d_t)
        else:
            if grid_eval:
                for d_ in d:
                    d_t = heatmap.fill_map(d_, map_)
                    pred_t.append(d_t)
            else:
                pred_t = d
        lb_i = i * pr.strides + 24
        lbt = labels[lb_i:(lb_i + decoder_length), :, 0]
        if grid_eval:
            lbg = []
            for x in lbt:
                x_l = heatmap.fill_map(x, map_)
                lbg.append(x_l)
            lbg = np.asarray(lbg)
            lbg = lbg.flatten()
        else:
            lbg = lbt.flatten()
        pred_t = np.asarray(pred_t)
        pred_t = pred_t.flatten()
        mae, mse, r2 = get_evaluation(pred_t, lbg)
        loss_mae += mae
        loss_rmse += mse
        r2_total += r2
        utils.update_progress((i + 1.0) / lt)
    loss_mae = loss_mae / lt * 300
    loss_rmse = sqrt(loss_rmse / lt) * 300
    r2_total = r2_total / lt
    print("MAE: %.6f %.6f" % (loss_mae, cr.ConcPM25(loss_mae)))
    print("RMSE: %.6f %.6f" % (loss_rmse, cr.ConcPM25(loss_rmse)))
    print("R2 Score: %.6f" % r2_total)
Example #16
def mser_detect(img, x_len, y_len):
    utils.update_progress('Detecting Regions')

    min_t = int(math.floor((y_len * x_len) * 0.0009))
    max_t = int(math.floor((y_len * x_len) * 0.05))

    #MSER(5, 60, 14400, 0.25, 0.2, 200, 1.01, 0.003, 5)  <- Default Values
    c_mser = cv2.MSER(5, min_t, max_t, 0.166, 0.153, 90, 1.001, 0.003, 5)
    c_regions = c_mser.detect(img, None)
    return [cv2.convexHull(p.reshape(-1, 1, 2)) for p in c_regions]
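A usage sketch for mser_detect, assuming a grayscale page image on disk (the filename is a placeholder; the OpenCV 2.x cv2.MSER constructor above is left as-is):
import cv2

img = cv2.imread('page.jpg', cv2.IMREAD_GRAYSCALE)  # placeholder input image
y_len, x_len = img.shape
hulls = mser_detect(img, x_len, y_len)
print('%d candidate regions' % len(hulls))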
Example #17
def evaluate_lstm(url,
                  url2,
                  decoder_length=24,
                  forecast_factor=0,
                  is_classify=False):
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    lt = data.shape[0] * data.shape[1]
    data = np.reshape(data, (lt, data.shape[-1]))
    if decoder_length > data.shape[-1]:
        decoder_length = data.shape[-1]
    dtl = len(data)
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    if not is_classify:
        loss_mae = [0.0] * decoder_length
        loss_rmse = [0.0] * decoder_length
    else:
        acc = 0.
    #: r2_total = 0.0
    cr = Crawling()
    for i, d in enumerate(data):
        if decoder_length < data.shape[-1]:
            pred_t = d[:decoder_length]
        else:
            pred_t = d
        lb_i = i * pr.strides + 24
        lbt = np.mean(labels[lb_i:(lb_i + decoder_length), :, forecast_factor],
                      axis=1)
        a = 0.
        for t_i, (p, l) in enumerate(zip(pred_t, lbt)):
            if not is_classify:
                # mae, mse, _ = get_evaluation(p, l)
                mae = abs(cr.ConcPM10(p * 300) - cr.ConcPM10(l * 300))
                loss_mae[t_i] += mae
                # loss_rmse[t_i] += mse
            else:
                a += classify_data(pred_t, lbt, forecast_factor)
        if is_classify:
            a = a / decoder_length
            acc += a
        # r2_total += r2
        utils.update_progress((i + 1.0) / dtl)
    if not is_classify:
        loss_mae = np.array(loss_mae) / lt
        # loss_rmse = [sqrt(x / lt)  * 300 for x in loss_rmse]
        # print("R2 score: %.6f" % r2_total)
        print_accumulate_error(loss_mae,
                               loss_rmse,
                               decoder_length,
                               forecast_factor=forecast_factor)
    else:
        acc = acc / lt * 100
        print("accuracy %.4f" % acc)
Example #18
def parse_time_series(config: BaseConfig, well_manager: WellManager,
                      data_directory: str):
    filepath = join(data_directory, config.filename)
    # Parse header
    # Keeping this generic, in case column orders change in subsequent data files
    infile = open(filepath)
    header_line = infile.readline()

    try:
        header_to_column = _get_column_indices(
            header_line, config.headers_to_read + [TimeSeriesHeaderTypes.api])
    except KeyError:
        raise LookupError(
            "Header value specified in config was not found in data file!")

    current_api = -1
    temp_lists = _get_temporary_lists(config.headers_to_read)

    all_lines = infile.readlines()
    total_lines = len(all_lines)
    parsed_lines = 0

    # Parse data
    for l in all_lines:
        sp = l.replace("\"", "").split(',')
        api = sp[header_to_column[TimeSeriesHeaderTypes.api]]
        if api != current_api:  #Write in large batches to minimize resizing of underlying numpy array
            _write_to_well(well_manager, api, temp_lists)

            temp_lists = _get_temporary_lists(config.headers_to_read)
            current_api = api
        try:
            date = datetime.strptime(
                sp[header_to_column[TimeSeriesHeaderTypes.date]],
                '%Y-%m-%d %H:%M:%S').timestamp(
                )  #Convert to seconds since linux epoch. Ignoring timezones
            oil_barrels = float(
                sp[header_to_column[TimeSeriesHeaderTypes.oil_barrels]])
            water_barrels = float(
                sp[header_to_column[TimeSeriesHeaderTypes.water_barrels]])
            gas_mcf = float(
                sp[header_to_column[TimeSeriesHeaderTypes.gas_mcf]])
        except ValueError:
            continue

        temp_lists[TimeSeriesHeaderTypes.water_barrels].append(water_barrels)
        temp_lists[TimeSeriesHeaderTypes.date].append(date)
        temp_lists[TimeSeriesHeaderTypes.oil_barrels].append(oil_barrels)
        temp_lists[TimeSeriesHeaderTypes.gas_mcf].append(gas_mcf)

        parsed_lines += 1
        if parsed_lines % 100 == 0:
            update_progress(parsed_lines / total_lines)

    _write_to_well(well_manager, api, temp_lists)
Example #19
def monitor(files):
    global done
    a = 0
    while True:
        incr = int(ceil((40 / files) * done))
        while a < incr:
            update_progress(None, 1)
            a = a + 1
        if a == 40 or BREAK or JOB_DONE:
            return
        continue
Example #20
def ES(cost_func,
       lb,
       ub,
       num_parents,
       num_children,
       num_generations,
       mutation,
       run_name='runs'):
    mu = num_parents
    lam = num_children
    all_params = np.zeros([num_generations + 1, num_children, len(lb)])
    all_costs = np.full([num_generations + 1, num_children], math.inf)

    P = np.zeros([mu, len(lb)])
    Pcost = np.zeros([mu, 1])
    print("Generating Parents")
    for i in range(mu):
        update_progress(i / (mu - 1))
        P[i, :], Pcost[i] = generate_parent(cost_func, lb, ub)

    all_params[0, 0:num_parents, :] = P
    all_costs[0, 0:num_parents] = Pcost[:, 0]

    best_costs = np.zeros([num_generations, 1])

    for g in range(num_generations):
        starttime = time.perf_counter()  # time.clock() was removed in Python 3.8
        print("Creating generation " + str(g))
        G = np.zeros([lam, len(lb)])
        Gcost = np.zeros([1, lam])
        print("Starting generation " + str(g))
        # generate the children for generation g
        for i in range(lam):
            G[i, :], Gcost[:, i] = generate_child_ES(cost_func, P, lb, ub,
                                                     mutation)

        # assign parents for next generation
        # to do this, I must first sort the children of this generation
        idx = np.argsort(Gcost)
        Gcost = Gcost[:, idx]
        G = G[idx]
        all_params[g + 1, :, :] = G
        all_costs[g + 1, :] = Gcost
        Pcost = Gcost[0, :, 0:mu].T
        P = G[0, 0:mu, :]
        best_costs[g] = Pcost[np.argmin(Pcost)]
        endtime = time.perf_counter()
        print("Generation {} runtime: {}".format(g, endtime - starttime))
        print("Best of generation " + str(g) + " has cost " +
              str(best_costs[g]))

    np.savetxt(run_name + '/best_costs.csv', best_costs)
    return P, Pcost
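A small usage sketch for ES with a toy sphere cost function and made-up bounds (the run_name directory is assumed to exist so np.savetxt can write into it):
import numpy as np

def sphere(x):
    # Toy cost: squared distance from the origin; minimum at x = 0.
    return float(np.sum(np.asarray(x) ** 2))

lb = [-5.0, -5.0]
ub = [5.0, 5.0]
P, Pcost = ES(sphere, lb, ub, num_parents=10, num_children=40,
              num_generations=20, mutation=0.1, run_name='runs')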
Example #21
def getLocalsForAllRegions(data,
                           calc,
                           history_lengths=None,
                           delays=None,
                           parameters=None,
                           print_max_idx=True,
                           compute_p=False):
    """
    Calculates the local AIS for all regions, by calling getLocalsForRegion

    Arguments:
        data -- Numpy array of shape (region, time). Preprocessing should have already been performed
        calc -- The JIDT calculator
        history_lengths -- Range of possible history length values, or None
        delays -- Range of possible delay values, or None
        print_max_idx -- If True, prints the maximum average AIS value and the corresponding indices for the
                         parameters. The first value gives the maximum index in the range of possible history
                         lengths, and the second value gives the maximum index in the range of possible delays
        parameters -- A DataFrame or numpy array containing a column of history lengths and a column of delays
                      Each row should correspond to a particular region
        compute_p -- If True, computes the p value of the returned AIS
    
    Returns:
        results -- A numpy array of shape (regions, timepoints), containing the local AIS values for each region
        all_parameters -- A numpy array with three columns, containing the (history_length, delay, DCE) of each region
        p_values -- A numpy array of all returned p values (or Nones if compute_p is False). Each row corresponds to a region
    """
    regions, timepoints = data.shape

    # Initialise
    results = np.zeros((regions, timepoints))
    all_parameters = np.zeros((regions, 3), dtype=int)
    p_values = np.zeros(regions)

    for region in range(regions):
        # Either parameters are provided, or the range of possible history lengths and delays should be provided
        if parameters is None:
            assert history_lengths is not None and delays is not None
            params = None
        else:
            if isinstance(parameters, pd.DataFrame):
                params = parameters.loc[region].values
            else:  # Numpy array or list, etc
                params = parameters[region]
        results[region], _, params, p_values[region] = getLocalsForRegion(
            data, calc, region, history_lengths, delays, params, print_max_idx,
            compute_p)
        all_parameters[region] = np.array(params)
        utils.update_progress(region / regions)  # Print progress bar
    return results, all_parameters, p_values
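A usage sketch following the docstring above; build_calculator is a hypothetical stand-in for however the JIDT AIS calculator is constructed in the original project:
import numpy as np

data = np.random.randn(10, 200)     # (regions, timepoints), already preprocessed
calc = build_calculator()           # hypothetical: returns a configured JIDT AIS calculator
results, all_parameters, p_values = getLocalsForAllRegions(
    data, calc,
    history_lengths=range(1, 6),
    delays=range(1, 3),
    compute_p=False)
print(results.shape)                # (10, 200) local AIS values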
Example #22
def check_answer(username='', assignment='', ipd=-1, answer=''):
    if not assignment in Global.progress[username]:
        Global.progress[username][assignment] = False
        utils.update_progress(username, Global.progress[username])

    ipd_idx = Global.data_dict[assignment]['ipd'].index(ipd)
    x_answer = np.array(json.loads(answer))
    x_true = Global.data_dict[assignment]['outputs'][ipd_idx]

    answer_is_correct = np.allclose(x_true, x_answer, atol=1e-5)
    if not Global.progress[username][assignment] and answer_is_correct:
        Global.progress[username][assignment] = answer_is_correct
        utils.update_progress(username, Global.progress[username])

    return jsonify({'success': answer_is_correct})
Example #23
def fem_movie(foldername='images/', T=20.0):
    y = np.array([2., 8.1, 2.45, 9])
    dt = 1.0 / 30
    vertices = np.array([[0., 0], [4, 0], [4, 10], [0, 10]])
    mesh = finis.triangulate(vertices=vertices, max_area=0.001)
    fe_u = finis.fe_space(mesh, order=2, order_int=2)
    fe_p = finis.fe_space(mesh, order=1, order_int=2)

    u1_h, u2_h, p_h, dy = fem_solve(fe_u, fe_p, y, eps=1e-6)
    fig = plt.figure(figsize=(9, 5))
    ax = fig.add_subplot(1, 1, 1)
    tricol = plt.tripcolor(fe_u['dof'][:, 0],
                           fe_u['dof'][:, 1],
                           u2_h,
                           shading='flat',
                           vmin=-2,
                           vmax=2)
    c1 = plt.Circle((y[0], y[1]), .25, color='w')
    ax.add_artist(c1)
    c2 = plt.Circle((y[2], y[3]), .25, color='w')
    ax.add_artist(c2)

    ax.set_title("$u_y$")
    ax.set_xlabel("$x$")
    plt.colorbar()
    ax.set_aspect('equal')

    _i_max = int(T / dt) + 2
    starttime = time.time()
    for _i in range(_i_max):
        if _i > 0:
            u1_h, u2_h, p_h, dy = fem_solve(fe_u, fe_p, y, eps=1e-6)
        ax.clear()
        c1 = plt.Circle((y[0], y[1]), .25, color='w')
        ax.add_artist(c1)
        c2 = plt.Circle((y[2], y[3]), .25, color='w')
        ax.add_artist(c2)
        tricol = plt.tripcolor(fe_u['dof'][:, 0],
                               fe_u['dof'][:, 1],
                               u2_h,
                               shading='flat',
                               vmin=-2,
                               vmax=2)
        plt.savefig(foldername + 'movie{}.png'.format(_i))
        np.save(foldername + 'y{}.npy'.format(_i), y)

        update_progress(_i / _i_max, starttime=starttime)
        y = y + dt * dy
Example #24
def convert_data_to_grid(url, out_url, url_att="", out_url_att="", part=1):
    grid = heatmap.build_map(pr.map_size)
    data = utils.load_file(url)
    lt = len(data)
    attention_data = None
    att_part = None
    print(url_att)
    if url_att:
        attention_data = utils.load_file(url_att)
        alt = len(attention_data)
        if lt != alt:
            raise ValueError(
                "Attention & Main Data need same length while %s and %s" %
                (lt, alt))
        data = zip(data, attention_data)
        att_part = []
    res = []
    if part != 1:
        bound = int(math.ceil(float(lt) / part))
    else:
        bound = lt
    for i, row in enumerate(data):
        if url_att:
            t, a = row
        else:
            t = row
        if i and (i % bound) == 0:
            p_i = i // bound
            out_url_name = out_url + "_" + str(p_i)
            utils.save_file(out_url_name, res)
            if url_att:
                att_out_url_name = out_url_att + "_" + str(p_i)
                utils.save_file(att_out_url_name, att_part)
            res = []
            att_part = []
        g = heatmap.fill_map(t, grid)
        res.append(g)
        if url_att:
            att_part.append(a)
        utils.update_progress(float(i) / lt)
    if part == 1:
        out_url_name = out_url
    else:
        out_url_name = out_url + "_" + str(part)
    utils.save_file(out_url_name, res)
    if url_att:
        att_out_url_name = out_url_att + "_" + str(part)
        utils.save_file(att_out_url_name, att_part)
Example #25
 def execute(self, args):
     print("start crawling aws")
     save_interval = args.save_interval
     start = datetime.strptime(args.start, pr.fm)
     start_point = utils.get_datetime_now()
     # output = "timestamp,PM10_VAL,PM2.5_VAL,O3(ppm),NO2(ppm),CO(ppm),SO2(ppm),PM10_AQI,PM2.5_AQI\n"
     output = ""
     counter = 0
     last_save = 0
     crawler_range = 86400
     if not args.forward:
         if args.end:
             end = datetime.strptime(args.end, pr.fm)
         else:
             end = utils.get_datetime_now()
         length = (end - start).total_seconds() / crawler_range
     else:
         end = datetime.strptime("2050-12-31 00:00:00", pr.fm)
     while start <= end:
         now = utils.get_datetime_now()
         # at first, crawling by daily
         # if up to the moment, crawling by hourly
         # how long from last crawled date to now?
         if (now - start).total_seconds() > crawler_range:
             tmp = start
             st = "00"
             ed = "24"
             if crawler_range != 86400:
                 st = self.format10(tmp.hour)
                 ed = self.format10(tmp.hour + 1)
             output, counter, last_save = self.craw_data_controller(
                 output, counter, last_save, save_interval, tmp, st, ed)
             # move pointer for timestep
             if not args.forward:
                 utils.update_progress(counter * 1.0 / length)
             else:
                 self.write_log(output)
                 output = ""
             if crawler_range == 86400:
                 start = start + timedelta(days=1)
             else:
                 start = start + timedelta(hours=1)
             print("AWS done")
         else:
             # Approach boundary (reach end) then reduce range to hourly crawling
             crawler_range = 3600
     self.write_log(output)
Example #26
def word_segment(root):
    directory = 'seg/%s' % get_container_folder(root)
    ut.create_folder(directory)
    files = [
        f for f in os.listdir(root) if os.path.isfile('%s/%s' % (root, f))
    ]
    total = len(files)
    for index, f in enumerate(files):
        path = '%s/%s' % (root, f)
        content = ut.load_file(path)
        if len(content) >= 3:
            title = content[0].replace('\n', '')
            par = content[2].replace('\n', '')
            title = ViTokenizer.tokenize(unicode(title, 'UTF-8'))
            par = ViTokenizer.tokenize(unicode(par, 'UTF-8'))
            ut.save_file_utf8('%s/%s' % (directory, f), title + '\n' + par)
        ut.update_progress((index + 1) * 1.0 / total)
Example #27
def accuracy_generalization_matrix(model_names, datasets, device):
    num_models = len(model_names)
    num_datasets = len(datasets)
    result = np.zeros((num_models, num_datasets))

    dataloaders = [
        create_dataset(dataset[1], dataset[2], dataset[3], train=False)
        for dataset in datasets
    ]

    for i, model_name in enumerate(model_names):
        model = torch.load(model_name)
        model = model.to(device)

        model.eval()
        for j, (dataset, dataloader) in enumerate(zip(datasets, dataloaders)):
            start = time.time()

            print(f'Evaluating {model_name} on data {dataset[0]}...')
            corrects = 0  # accumulate correct predictions over the whole dataloader
            for k, (inputs, labels, _) in enumerate(dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device)

                with torch.no_grad():
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)

                corrects += torch.sum(preds == labels)

                if k % 100 == 0:
                    update_progress(k / len(dataloader.dataset))

            accuracy = corrects.item() / len(dataloader.dataset)
            result[i][j] = accuracy

            print(
                f'Time Elapsed: {time.time() - start:.0f}s, Accuracy: {accuracy:.2f}%'
            )

    print(f'---- Cross-Dataset Generalization ----')
    print(result)

    return result
Example #28
def create_classifier(iterations=100):
    """
    Return the classifier that did the best at classifying a subset of the data
    after training for the given number of iterations

    :param iterations: number of iterations to test on
    :return:    tuple: (classifier, accuracy of classifier) 
    """
    negids = reddit_politics.fileids("neg")
    posids = reddit_politics.fileids("pos")

    negfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), "neg") for f in negids]
    posfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), "pos") for f in posids]

    # track the most accurate classifier so far
    best_classifier = None
    highest_accuracy = 0
    for iter_num in range(iterations):
        # randomly shuffle the feature sets to get new subsets to test and train on
        random.shuffle(negfeats)
        random.shuffle(posfeats)

        negcutoff = int(len(negfeats) * 3 / 4)
        poscutoff = int(len(posfeats) * 3 / 4)

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]  # negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]  # negfeats[negcutoff:] + posfeats[poscutoff:]

        if DEBUG:
            print("Train on %d instances, test on %d instances.\n" % (len(trainfeats), len(testfeats)))

        # train the classifier on the training features and determine its accuracy
        classifier = NaiveBayesClassifier.train(trainfeats)
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        if DEBUG:
            print("\nAccuracy:", accuracy)

        # if this classifier outperformed all before it, track it and its accuracy
        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_classifier = classifier
        utils.update_progress(iter_num / iterations, message="Testing Classifiers")
    sys.stdout.write("\n\n")
    return (best_classifier, highest_accuracy)
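A usage sketch, assuming the reddit_politics corpus reader, FILTER_LIST, DEBUG, and word_feats are defined elsewhere in this module:
best, acc = create_classifier(iterations=25)
print("Best accuracy: %.3f" % acc)
best.show_most_informative_features(10)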
Example #29
def get_results():
    global FLAG, JOB_DONE  # module-level flags read by the worker/monitor threads
    update_progress('Fetching Object Recognition Results')
    # Initialize up to concurrent number of threads

    for i in range(concurrent):
        t = Thread(target=cloudsight_fetch)
        threads.append(t)
        t.start()
    # Initialize our monitoring thread for progress reporting
    t = Thread(target=monitor, args=(len(glob('/tmp/*.jpg')), ))
    threads.append(t)
    t.start()

    # Feed filenames into queue to be consumed by threads
    for filename in glob('/tmp/*.jpg'):
        resultdict[filename] = {
            "cloudsight": [],
            "msft": [],
            "ibm": [],
            "google": []
        }
        fail_check()
        time.sleep(3.1)  # Cloudsight Limit 1 req/3 sec
        cs_q.put(filename)
        break  #!!!!!!! REMOVE FOR FULL PROCESSING. RIGHT NOW ONLY 1 IMAGE WILL BE SENT.!!!!!!!!!!!!!
    # We now announce to all threads that queue feeding has stopped.
    # If they see both FLAG = true and the queue is empty, thread will quit
    FLAG = True

    # Keep checking until the queue is empty, and check for any errors
    while True:
        if cs_q.empty() == False:
            fail_check()
            time.sleep(1)
        else:
            break
    # Join all remaining threads from queue if not done so by now.
    cs_q.join()

    # Declare job is done to monitor thread so it may quit now
    JOB_DONE = True

    return resultdict
Example #30
    def run(self):
        open(self.case_db_path, 'w').close()

        forms_per_user = float(settings.CASES_PER_USER * settings.FORMS_PER_CASE)

        for i, user_id in enumerate(self.user_ids):
            print('\n\n## Loading data for user {} of {}'.format(i, self.num_users))
            synclog_id = self.get_synclog_id(user_id)
            num_cases_user = 0
            num_forms_user = 0
            with self.form_loader as loader:
                while num_forms_user < forms_per_user:
                    create_case = num_cases_user < settings.CASES_PER_USER
                    form = self.get_form(user_id, synclog_id, create_case)
                    loader.put_doc(form)
                    num_forms_user += 1
                    if create_case:
                        num_cases_user += 1

                    if num_forms_user % 50 == 0:
                        update_progress('Forms:', num_forms_user / forms_per_user)

            self.num_forms += num_forms_user
            self.num_cases += num_cases_user

            print('')
            with self.case_loader as loader:
                case_ids = self.case_forms.keys()
                num_cases = float(len(case_ids))
                for j, case_id in enumerate(case_ids):
                    is_child_case = random.random() < settings.CHILD_CASE_RATIO
                    forms = self.case_forms[case_id]
                    case = self.get_case(user_id, case_id, forms, is_child_case)
                    loader.put_doc(case)

                    cases_created = j + 1
                    if cases_created % 50 == 0:
                        update_progress('Cases:', cases_created / num_cases)

            self.save_database_and_clear()

        self.print_actual()
Example #31
 def main(self, args):
     #filename = "craw_weather_%s_%s_%s.txt" % (args.city, utils.clear_datetime(args.start), utils.clear_datetime(args.end))
     start = datetime.strptime(args.start, pr.fm)
     if args.end:
         end = datetime.strptime(args.end, pr.fm)
     else:
         end = utils.get_datetime_now()
     start_point = utils.now_milliseconds()
     # output = "timestamp,PM10_VAL,PM2.5_VAL,O3(ppm),NO2(ppm),CO(ppm),SO2(ppm),PM10_AQI,PM2.5_AQI\n"
     output = ""
     length = (end - start).total_seconds() / 86400.0
     save_interval = args.save_interval
     counter = 0
     last_save = 0
     if "," in args.city:
         cities = args.city.split(",")
     else:
         cities = [args.city]
     while start <= end:
         now = utils.now_milliseconds()
         diff = now - start_point
         # print(elapsed_time)
         if diff >= 100:
             # try:
             counter += 1
             date = "%s-%s-%s" % (start.year, self.format10(
                 start.month), self.format10(start.day))
             for c in cities:
                 html = self.craw_data(c, date)
                 data = self.mine_data(date, html, c)
                 if data:
                     output += "\n".join(data) + "\n"
                 if (counter - last_save) == save_interval:
                     last_save = counter
                     self.write_log(output)
                     output = ""
             # except Exception as e:
             #    print(start.strftime(pr.fm), e)
             start = start + timedelta(days=1)
             start_point = now
             utils.update_progress(counter * 1.0 / length)
     self.write_log(output)
Example #32
    def __get_vocabs(self):
        num_tokens = 0
        num_sentences = 0
        vocabs = {}
        with open(self.path, 'r') as f:
            progress = 0
            line = f.readline()
            while line:
                num_sentences += 1
                if not (num_sentences % 10000):
                    progress = f.tell() * 1.0 / self.size
                    utils.update_progress(progress, "Counting vocabs", 40)

                tokens = line.rstrip().split()
                num_tokens += len(tokens)
                list(map(lambda x: utils.inc_dict_value(vocabs, x), tokens))

                line = f.readline()

            if progress < 1: utils.update_progress(1, "Counting vocabs", 40)
        return vocabs, num_tokens, num_sentences
Example #33
def main(limit=len(ALL)):
    d = dict([a for a in ALL.items()[:limit]])
    total = float(limit)
    out = []
    i = 0
    record_count = 0
    print 'Exporting orders...'
    ft = open(export_dir+'ORDERS.marc.txt', 'w')
    fb = open(export_dir+'ORDERS.marc.dat', 'wb')
    for (recid, record) in d.items():
        i = i + 1
        rec_binary = format_record(recid)
        fb.write(rec_binary.as_marc())
        ft.write(str(rec_binary) + '\n==================\n')
        record_count = record_count + 1
        if i > limit:
            break
        update_progress(i*100/total)
    fb.close()
    ft.close()
    print "\nOrders exported: %d/%d" % (record_count, limit)
Example #34
def import_cities_to_redis(connection, location_file, lang):
    key = 'cityid2city:' + lang + ':'
    with open(location_file) as csvfile:
        reader = csv.DictReader(csvfile)
        update_progress("import_cities_to_redis", 0)
        row_count = sum(1 for row in reader)
        count = 0
        csvfile.seek(0)
        for row in reader:
            if count == 0:
                count = count + 1
                continue
            count = count + 1
            city_id = row['geoname_id']
            continent_code = row['continent_code']
            continent_name = row['continent_name']
            country_code = row['country_iso_code']
            country_name = row['country_name']
            subdivision_code = row['subdivision_1_iso_code']
            subdivision_name = row['subdivision_1_name']
            city_name = row['city_name']
            metro_code = row['metro_code']
            if count % 1000 == 0:
                update_progress("import_cities_to_redis",
                                count / float(row_count))
            connection.hset(
                key, city_id,
                json.dumps([
                    continent_code, continent_name, country_code, country_name,
                    subdivision_code, subdivision_name, city_name, metro_code
                ]))
        update_progress("import_cities_to_redis", 1)
Example #35
 def execute(self, args):
     print("start crawling aqi seoul")
     save_interval = args.save_interval
     start = datetime.strptime(args.start, pr.fm)
     # start_point = utils.get_datetime_now()
     output = ""
     counter = 0
     last_save = 0
     # crawler_range = 3600
     if not args.forward:
         if args.end:
             end = datetime.strptime(args.end, pr.fm)
         else:
             end = utils.get_datetime_now()
         length = (end - start).total_seconds() / 86400
     else:
         end = datetime.strptime("2050-12-31 00:00:00", pr.fm)
     while start <= end:
         now = utils.get_datetime_now()
         # if (now - start_point).total_seconds() >= args.interval:
         #     start_point = now
         if (now - start).total_seconds() > 3600:
             hour = start.hour
             tmp = start
             if tmp.hour == 0:
                 tmp = tmp - timedelta(hours=1)
                 hour = "24"
             else:
                 hour = self.format10(tmp.hour)
             st_ = start.strftime(pr.fm)
             output, counter, last_save = self.craw_data_controller(output, counter, last_save, save_interval, tmp, hour, st_)
             # move pointer for timestep
             start = start + timedelta(hours=1)
             if not args.forward:
                 utils.update_progress(counter * 1.0 / length)
             else:
                 self.write_log(output)
                 output = ""
     self.write_log(output)      
Example #36
    def gen_dataset(self, params, dataset_type, seed=0):
        random.seed(seed)
        np.random.seed(seed)
        if dataset_type == 'train':
            num_seq = params.TRAIN_NUM_SEQ
            path = params.TRAIN_SET_PATH
        elif dataset_type == 'val':
            num_seq = params.VAL_NUM_SEQ
            path = params.VAL_SET_PATH
        elif dataset_type == 'test':
            num_seq = params.TEST_NUM_SEQ
            path = params.TEST_SET_PATH
        else:
            raise ValueError('dataset_type must be train, val, or test')

        for i in range(num_seq):
            sdg = SequenceDataGenerator(params.NUM_SHAPE, params.IMG_SIZE,
                                        params.SEQUENCE_LEN,
                                        params.RANDOM_SIZE,
                                        params.ROTATE_SHAPES)
            utils.update_progress(i / num_seq)
            seq = sdg.get_sequence()
            pickle_folder_path = os.path.join(path, f'seq_{i}')
            utils.mkdir_if_missing(pickle_folder_path)
            pickle_full_path = os.path.join(pickle_folder_path,
                                            'sequence.pickle')
            with open(pickle_full_path, 'wb') as handle:
                pickle.dump(seq, handle)
            image_count = 0
            for info in seq:
                image = info['image']
                image_folder_path = os.path.join(path, f'seq_{i}', 'images')
                utils.mkdir_if_missing(image_folder_path)
                image_full_path = os.path.join(image_folder_path,
                                               f'{image_count:05d}.png')
                image = (image * 255).astype(np.uint8)
                cv2.imwrite(image_full_path, image)
                image_count += 1
Example #37
def main(limit=len(ALL)):
    d = dict([a for a in ALL.items()[:limit]])
    total = float(limit)
    out = []
    i = 1
    record_count = 0
    #erase the log file
    with codecs.open(log_dir + 'log_bib.txt', 'w', encoding='utf8') as f:
        f.close()
    with codecs.open(log_dir + 'authors_split.txt', 'w', encoding='utf8') as f:
        f.close()
    w = codecs.open(export_dir+'BIBLIOGRAPHIC.marc.txt', 'w')
    UTF8Writer = codecs.getwriter('utf8')
    f = UTF8Writer(w)
    # f = codecs.open(export_dir+'BIBLIOGRAPHIC.marc.txt', 'w', encoding='utf8')
    fb = open(export_dir+'BIBLIOGRAPHIC.marc.dat', 'wb')
    print 'Exporting bibliographic records...'
    for (recid, record) in d.items():
        # print recid
        # out.append(format_record(recid))
        if record['TI'].find('eReader') == -1:
            # if not 'HIDE' in record:
            rec_binary = format_record(recid)
            # print recid
            # print rec_binary
            f.write(str(rec_binary) + u'\n==================\n')
            fb.write(rec_binary.as_marc())
            record_count = record_count + 1

        update_progress(i*100/total)
        i = i + 1
        if i > limit:
            break
    print "\ntotal records:  \t" + str(i-2)
    print "exported records:\t" + str(record_count)
    f.close()
    fb.close()
Example #38
def generate_subimages(hulls, img, h, v, folder='/tmp/'):
    utils.update_progress('Extracting Regions')
    fid = 0
    pathlist = []

    coords = edge_coordinates(hulls)
    if len(coords) == 0:
        cv2.imwrite('/tmp/image.jpg', img)
        pathlist.append(('/tmp/image.jpg', (0, h, 0, v)))
        return pathlist
    else:
        # p_c = padded coordinates. c = coordinates.
        for p_c, c in process_coords(coords, h, v):
            p_x1, p_y1, p_x2, p_y2 = p_c
            x1, y1, x2, y2 = c
            roi = img[p_y1:p_y2, p_x1:p_x2]
            fid = fid + 1
            path = folder + str(fid) + '.jpg'
            pathlist.append(
                (path, p_c, c, (x1 + ((x2 - x1) // 2), y1 + ((y2 - y1) // 2))))
            # Write subimage to file
            cv2.imwrite(path, roi)

    return pathlist
Example #39
 def _read_lexicon_to_memory(cls, file_location):
     print "\nReading lexicon to memory..."
     lexicon = codecs.open(file_location, 'rb', 'utf-8')
     lexicon_list = lexicon.readlines()
     if ENV.PROGRESS_BAR == True:    
         util.update_progress(0)
     for idx, entry in enumerate(lexicon_list):
         if ENV.PROGRESS_BAR == True:
             util.update_progress(float(idx) / float(len(lexicon_list)))
         entry = entry.replace('\n', '').split(' ')
         entry[0] = int(entry[0])
         entry[2] = int(entry[2])
         lexicon_list[idx] = entry
     if ENV.PROGRESS_BAR == True:
         util.update_progress(1)
     lexicon.close()
     return lexicon_list
Example #40
 def _read_full_postings_to_memory(cls, file_location):
     print "\nReading posting list to memory..."
     postings = codecs.open(file_location, 'rb', 'utf-8')
     posting_list = {}
     posting_lines = postings.readlines()
     if ENV.PROGRESS_BAR == True:
         util.update_progress(0)
     for idx, line in enumerate(posting_lines):
         if ENV.PROGRESS_BAR == True:
             util.update_progress(float(idx) / float(len(posting_lines)))
         line = line.replace('\n', '').split(': ')
         doc_info = line[1].split('->')
         for idx, doc in enumerate(doc_info):
             doc = re.sub(r'[\(\)]', '', doc)
             doc = doc.split(', ')
             doc_info[idx] = [int(doc[0]), int(doc[1])]
         posting_list[int(line[0])] = doc_info
     if ENV.PROGRESS_BAR == True:
         util.update_progress(1)
     postings.close()
     return posting_list
Example #41
 def _read_doc_list_to_memory(cls, file_location):
     print "\nExtracting Document List..."
     doc_lengths = []
     documents = codecs.open(file_location, 'rb', 'utf-8')
     document_list = documents.readlines()
     doc_dict = {}
     if ENV.PROGRESS_BAR == True:
         util.update_progress(0)
     for idx, entry in enumerate(document_list):
         if ENV.PROGRESS_BAR == True:
             util.update_progress(float(idx) / float(len(document_list)))
         entry = entry.replace('\n', '').split(' ')
         entry[0] = int(entry[0])
         entry[1] = int(entry[1])
         document_list[idx] = entry
         doc_dict[entry[0]] = {'length': entry[1]}
         doc_lengths.append(entry[1])
     if ENV.PROGRESS_BAR == True:
         util.update_progress(1)
     cls.avg_doc_length = np.mean(doc_lengths)
     cls.collection_length = np.sum(doc_lengths)
     return doc_dict
Example #42
    posts = get_posts(start_date, end_date)

    # dictionary containing candidates mapped to lists of sentiment values for that candidate
    sentiments = dict()
    totals = dict()
    overall_total = 0
    num_candidates = len(list(posts.keys()))
    current = 1
    for candidate in posts:
        sentiments[candidate] = []
        totals[candidate] = 0
        for score, text in posts[candidate]:
            sentiments[candidate].append(classify(classifier, text, score))
            totals[candidate] += 1
            overall_total += 1
        utils.update_progress(current / num_candidates, message=candidate)
        current += 1
    # clear the progress bars for the candidates
    sys.stdout.write("\r" + " " * 70 + "\n")

    # normalize the values to 0
    lowest = 0
    for candidate in sentiments:
        avg = sum(sentiments[candidate]) / overall_total
        if avg < lowest:
            lowest = avg

    # display sentiment values for each candidate to the console
    print("\nRelative Sentiment Values:")
    print("(normalized to 0, higher is more positive)\n")
    for candidate in sentiments:
Example #43
 def _extract_document_summations(cls):
     print "\nExtracting document tf-idf summations for use in Vector Space Cosine..."
     if ENV.PROGRESS_BAR == True:
         util.update_progress(0)
     # for every term in our posting list
     for idx, term in enumerate(cls.posting_list):
         if ENV.PROGRESS_BAR == True:
             util.update_progress(float(idx) / float(len(cls.posting_list)))
         docs = cls.posting_list[term]
         # run through the documents for each term and add the additional tfidf to an accumulation in the dict
         for doc in docs:
             tfidf_addition = qp.calculate_tf_idf(doc[1], cls.get_df_by_term_id(term), len(cls.doc_list.keys()))
             tfidf_addition_squared = np.square(tfidf_addition)
             if 'tf_idf_sum' in cls.doc_list[doc[0]]:
                 cls.doc_list[doc[0]]['tf_idf_sum'] += tfidf_addition_squared
             else:
                 cls.doc_list[doc[0]]['tf_idf_sum'] = tfidf_addition_squared
     if ENV.PROGRESS_BAR == True:
         util.update_progress(1)
     
     print "\nExtracting document weight summations for use in Vector Space Cosine..."
     if ENV.PROGRESS_BAR == True:
         util.update_progress(0)
     # Again, we run through each term in our posting list
     for idx, term in enumerate(cls.posting_list):
         if ENV.PROGRESS_BAR == True:
             util.update_progress(float(idx) / float(len(cls.posting_list)))
         docs = cls.posting_list[term]
         # each doc within each term has the VS weight calculated for the terms to find a summation
         for doc in docs:
             weight_addition = float(qp.calculate_tf_idf(doc[1], cls.get_df_by_term_id(term), len(cls.doc_list.keys()))) / float(cls.doc_list[doc[0]]['tf_idf_sum'])
             weight_addition_squared = np.square(weight_addition)
             if 'sum_weight' in cls.doc_list[doc[0]]:
                 cls.doc_list[doc[0]]['sum_weight'] += weight_addition_squared
             else:
                 cls.doc_list[doc[0]]['sum_weight'] = weight_addition_squared
     if ENV.PROGRESS_BAR == True:
         util.update_progress(1)
Example #44
        shell=True
    )
    duration = float(output)
    if prune_min or prune_max:
        duration_fit = True
        if prune_min:
            if duration < args.min_duration:
                duration_fit = False
        if prune_max:
            if duration > args.max_duration:
                duration_fit = False
        if duration_fit:
            new_files.append((files[x], duration))
    else:
        new_files.append((files[x], duration))
    update_progress(x / float(size))

print("\nSorting files by length...")


def func(element):
    return element[1]


new_files.sort(key=func)

print("Saving new manifest...")

with io.FileIO(args.output_path, 'w') as f:
    for file_path in new_files:
        sample = file_path[0].strip() + '\n'