def download_quandl_dataset(quandl_api_key, database, dataset, save_path, columns, tickers, start_date, end_date):
    """
    Download a dataset from Quandl and save it to `save_path`.
    Filter by columns, tickers, and date
    :param quandl_api_key: The Quandl API key
    :param database: The Quandl database to download from
    :param dataset: The dataset to download
    :param save_path: The path to save the dataset
    :param columns: The columns to save
    :param tickers: The tickers to save
    :param start_date: The earliest date of rows to keep
    :param end_date: The latest date of rows to keep
    """
    scrape_url = 'https://www.quandl.com/api/v3/datatables/{}/{}?qopts.export=true&api_key={}'\
        .format(database, dataset, quandl_api_key)
    scrape_request = requests.get(scrape_url)
    bulk_download_url = scrape_request.json()['datatable_bulk_download']['file']['link']

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_wiki_file = os.path.join(tmp_dir, 'tmp.zip')

        bulk_download_request = requests.get(bulk_download_url, stream=True, cookies=scrape_request.cookies)
        total_size = int(bulk_download_request.headers.get('content-length', 0))
        block_size = 1024 * 1024
        with open(tmp_wiki_file, 'wb') as f:
            for data in tqdm(
                    bulk_download_request.iter_content(block_size),
                    total=math.ceil(total_size / block_size),
                    unit='MB',
                    unit_scale=True,
                    desc='Downloading Data'):
                f.write(data)

        with tqdm(total=5, desc='Transforming Data', unit='Action') as pbar:
            # Unzip downloaded data
            zip_ref = zipfile.ZipFile(tmp_wiki_file, 'r')
            zip_ref.extractall(tmp_dir)
            zip_ref.close()
            pbar.update(1)

            # Check if the zip file only contains one csv file
            #   We're assuming that Quandl will always give us the data in a single csv file.
            #   If it's different, we want to throw an error.
            csv_files = glob.glob(os.path.join(tmp_dir, '*.csv'))
            assert len(csv_files) == 1,\
                'Bulk download of Quandl Wiki data failed. Wrong number of csv files found. Found {} file(s).'\
                    .format(len(csv_files))
            tmp_csv_file = csv_files[0]

            tmp_df = pd.read_csv(tmp_csv_file)
            pbar.update(1)
            tmp_df['date'] = pd.to_datetime(tmp_df['date'])
            pbar.update(1)

            # Remove unused data and save
            tmp_df = tmp_df[tmp_df['date'].isin(pd.date_range(start_date, end_date))]  # Filter unused dates
            tmp_df = tmp_df[tmp_df['ticker'].isin(tickers)]  # Filter unused tickers
            pbar.update(1)
            tmp_df.to_csv(save_path, columns=columns, index=False)  # Filter unused columns and save
            pbar.update(1)
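# A minimal usage sketch for the function above (not from the original source):
# the API key, save path, ticker list, column names and the WIKI/PRICES
# database/dataset names are placeholders for whatever Quandl datatable you want.
download_quandl_dataset(
    quandl_api_key='YOUR_API_KEY',
    database='WIKI',
    dataset='PRICES',
    save_path='wiki_prices_subset.csv',
    columns=['ticker', 'date', 'adj_close'],
    tickers=['AAPL', 'MSFT'],
    start_date='2015-01-01',
    end_date='2017-12-31')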
def fix_suspects(input_dir, output_dir):
    skipped_items = []
    count = 0

    with open("wonky_unitdate_display_candidates.csv", mode="r") as f:
        example_dict = {}
        reader = csv.reader(f)
        items = list(reader)
        items.reverse()  # reverse the input list so that xpaths remain valid as I edit multiple entries in one ead

    for filename, xpath, text_with_tags, text_without_tags, action in tqdm(items):
        example_dict[filename] = example_dict.get(filename, [])
        example_dict[filename].append((xpath, action, text_without_tags))

    for ead, dict_value_list in tqdm(example_dict.items()):
        tree = etree.parse(os.path.join(input_dir, ead))

        for xpath, action, text in dict_value_list:
            unittitle = tree.xpath(xpath)[0]
            
            #disparity = find_date_disparity(unittitle)
            #if disparity > 10 and action == "move_and_calcify" and ead != "geolsurv.xml":
                #skipped_items.append([ead, xpath, text, disparity, action])
            #else:
                #move_unitdates(unittitle, action)
            move_unitdates(unittitle, action)

        with open(os.path.join(output_dir, ead), mode="wb") as f:
            f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True, encoding="utf-8"))

    #with open("skipped_items.csv", mode="wb") as f:
        #writer = csv.writer(f)
        #writer.writerows(skipped_items)

    print("Skipped {0} entries".format(len(skipped_items)))
def run():
    batch_size = 4000

    global signatures
    signatures = get_pickled_signatures()

    pool = avito_utils.PoolWrapper(processes=4)
    name = 'ssim'

    print 'processing train data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_%s_train.csv' % name)

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_%s_train.csv' % name)

    print 'processing train data took %0.5fs' % (time() - t0)

    print 'processing test data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_%s_test.csv' % name)

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_%s_test.csv' % name)
        
    print 'processing test data took %0.5fs' % (time() - t0)

    pool.close()
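# prepare_batches() and process_batch() are project helpers that are not shown
# here. Judging from the loop above, the batching helper yields
# (batch_no, dataframe_slice) pairs; a plausible self-contained sketch:
def prepare_batches_sketch(df, batch_size):
    for batch_no, start in enumerate(range(0, len(df), batch_size)):
        yield batch_no, df.iloc[start:start + batch_size]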
Example #4
def augment_arrays(project):

    array_path = os.path.join(project['path'], 'array')
    augmented_path = os.path.join(project['path'], 'augmented')
    shutil.rmtree(augmented_path,ignore_errors=True)
    os.makedirs(augmented_path)

    if project['augmentations'] is None:
        print('No augmentations selected: copying train arrays as is.')
        files = os.listdir(array_path)
        for file in tqdm(files):
            shutil.copy(os.path.join(array_path, file),augmented_path)

    else:
        print('Generating image augmentations:')

        for img_idx, (array, label, label_name) in tqdm(enumerate(gen_arrays_from_dir(array_path))):
            split_label_name = '-'.join(label_name.split('-')[2:-1])
            for aug_idx, (array_aug, label_aug) in enumerate(gen_augment_arrays(array, label, project['augmentations'], project['category_rounds'][split_label_name])):
                cat_idx = np.argmax(label_aug)
                cat = project['categories'][cat_idx]
                img_name = '{}-{:02d}-img-{}-{}'.format(img_idx, aug_idx,
                                                            cat, cat_idx)
                label_name = '{}-{:02d}-label-{}-{}'.format(img_idx, aug_idx,
                                                            cat, cat_idx)
                aug_path = os.path.join(augmented_path, img_name)
                label_path = os.path.join(augmented_path, label_name)
                np.save(aug_path, array_aug)
                np.save(label_path, label_aug)

    project['is_augmented'] = True
    return project
Example #5
def find_duplicates(directories):
    for d in directories:
        if not os.path.exists(d):
            raise ValueError("Directory %s does not exist" % d)
        elif not os.path.isdir(d):
            raise ValueError("Expected %s to be a directory" % d)

    file_hashes = defaultdict(set)

    print("Scanning for files…")

    all_files = deque()
    for filename in tqdm(find_files(directories)):
        all_files.append(filename)

    print("Hashing %d files" % len(all_files))

    with ThreadPoolExecutor() as executor:
        for filename, digest in tqdm(
            executor.map(get_file_hash, all_files), total=len(all_files)
        ):

            file_hashes[digest].add(filename)

    for digest, filenames in file_hashes.items():
        if len(filenames) < 2:
            continue
        else:
            yield digest, filenames
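# find_files() and get_file_hash() are helpers that are not shown above. A
# plausible chunked hashing helper returning the (filename, digest) pairs the
# executor.map() loop expects might look like this sketch:
import hashlib

def get_file_hash_sketch(filename, block_size=1 << 20):
    digest = hashlib.sha256()
    with open(filename, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            digest.update(block)
    return filename, digest.hexdigest()

# Consuming the generator, with hypothetical directories:
# for digest, paths in find_duplicates(['/data/photos', '/backup/photos']):
#     print(digest, sorted(paths))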
def run():
    global mongo, scaler
    mongo = MongoWrapper(avito_utils.avito_db)
    scaler = prepare_scaler()

    batch_size = 8000
    name = 'imagemagick'

    pool = avito_utils.PoolWrapper()

    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_%s_train.csv' % name)
    print 'read train set, start processing...'
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        batch = process_batch(batch, pool)
        append_to_csv(batch, 'features_%s_train.csv' % name)
    print 'processing train set took %0.5fs' % (time() - t0)

    t0 = time()
    df =  pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_%s_test.csv' % name)
    print 'read test set, start processing...'
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        batch = process_batch(batch, pool)
        append_to_csv(batch, 'features_%s_test.csv' % name)
    print 'processing test set took %0.5fs' % (time() - t0)

    pool.close()
Example #7
    def to_html(self, outdir, template=None):

        pages_set = self.pages_set

        if template is None:
            template = textwrap.dedent("""\
                <html>
                    <head>
                        <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
                        <title>Page {page}</title>
                        <link rel="stylesheet" type="text/css" href="teletext.css" title="Default Style"/>
                        <link rel="alternative stylesheet" type="text/css" href="teletext-noscanlines.css" title="No Scanlines"/>
                        <script type="text/javascript" src="cssswitch.js"></script>
                    </head>
                    <body onload="set_style_from_cookie()">
                    {body}
                    </body>
                </html>
            """)

        for magazineno, magazine in tqdm(self.magazines.items(), desc='Magazines', unit='M'):
            for pageno, page in tqdm(magazine.pages.items(), desc='Pages', unit='P'):
                pagestr = f'{magazineno}{pageno:02x}'
                with open(os.path.join(outdir, f'{pagestr}.html'), 'w') as outfile:
                    body = '\n'.join(
                        subpage.to_html(pages_set) for n, subpage in sorted(page.subpages.items())
                    )
                    outfile.write(template.format(page=pagestr, body=body))
def test_word2id():
    """把测试集的所有词转成对应的id。"""
    time0 = time.time()
    print('Processing eval data.')
    df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4],
                          names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
    print('test question number %d' % len(df_eval))
    # For questions without a title, use the content instead
    na_title_indexs = list()
    for i in xrange(len(df_eval)):
        word_title = df_eval.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d test questions without title.' % len(na_title_indexs))
    for na_index in na_title_indexs:
        df_eval.at[na_index, 'word_title'] = df_eval.at[na_index, 'word_content']
    # For questions without content, use the title instead
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_eval))):
        word_content = df_eval.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d test questions without content.' % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_eval.at[na_index, 'word_content'] = df_eval.at[na_index, 'word_title']
    # Convert to id form
    p = Pool()
    eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values))
    np.save('../data/wd_eval_title.npy', eval_title)
    eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values))
    np.save('../data/wd_eval_content.npy', eval_content)
    p.close()
    p.join()
    print('Finished changing the eval words to ids. Costed time %g s' % (time.time() - time0))
def train_word2id():
    """把训练集的所有词转成对应的id。"""
    time0 = time.time()
    print('Processing train data.')
    df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 2, 4],
                           names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
    print('training question number %d ' % len(df_train))
    # For questions without content, use the title instead
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_train))):
        word_content = df_train.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d train questions without content.' % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_train.at[na_index, 'word_content'] = df_train.at[na_index, 'word_title']
    # Drop questions without a title
    na_title_indexs = list()
    for i in xrange(len(df_train)):
        word_title = df_train.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d train questions without title.' % len(na_title_indexs))
    df_train = df_train.drop(na_title_indexs)
    print('After dropping, training question number(should be 2999952) = %d' % len(df_train))
    # Convert to id form
    p = Pool()
    train_title = np.asarray(p.map(get_id4words, df_train.word_title.values))
    np.save('../data/wd_train_title.npy', train_title)
    train_content = np.asarray(p.map(get_id4words, df_train.word_content.values))
    np.save('../data/wd_train_content.npy', train_content)
    p.close()
    p.join()
    print('Finished changing the training words to ids. Costed time %g s' % (time.time() - time0))
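# Note: Pool.map() above blocks without per-item progress. A common tqdm
# pattern, shown here as a self-contained Python 3 sketch with a stand-in
# worker (the real worker would be get_id4words), is to wrap Pool.imap():
from multiprocessing import Pool
from tqdm import tqdm

def _square(x):  # stand-in for the real mapping function
    return x * x

if __name__ == '__main__':
    with Pool() as pool:
        values = list(range(10000))
        results = list(tqdm(pool.imap(_square, values, chunksize=256),
                            total=len(values), desc='mapping'))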
def merge_po_files(po_files, fuzzy=False):
    """Given a list of po files, replicate known translations from each
    file to the others.
    """
    known_translations = {}
    # Aggregate all known translations
    for po_file in tqdm(po_files, desc="Searching known translations"):
        po_file = polib.pofile(po_file)
        for entry in po_file:
            if 'fuzzy' not in entry.flags and entry.msgstr != '':
                known_translations[entry.msgid] = entry.msgstr
    # Propagate them
    done = 0
    for po_file in tqdm(po_files, desc="Replicating them"):
        po_file = polib.pofile(po_file)
        for entry in po_file:
            if entry.msgid in known_translations:
                entry.msgstr = known_translations[entry.msgid]
            elif fuzzy:
                best_match = find_best_match(list(known_translations.keys()),
                                             entry.msgid)
                if best_match is not None:
                    print("I think\n  {}\n =\n  {}".format(entry.msgid,
                                                           best_match))
                    entry.msgstr = known_translations[best_match]
                    entry.flags.append('fuzzy')
        po_file.save()
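# A minimal usage sketch for merge_po_files(); the locales/ layout below is a
# placeholder, not a path from the original project.
import glob

po_paths = glob.glob('locales/**/*.po', recursive=True)
merge_po_files(po_paths, fuzzy=True)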
Example #11
File: SSAH.py Project: StatML/SSAH
    def generate_code(self, Modal, bit, generate):
        batch_size = 128
        if generate == "label":
            num_data = Modal.shape[0]
            index = np.linspace(0, num_data - 1, num_data).astype(int)
            B = np.zeros([num_data, bit], dtype=np.float32)
            for iter in tqdm(xrange(num_data / batch_size + 1)):
                ind = index[iter * batch_size: min((iter + 1) * batch_size, num_data)]
                label = Modal[ind, :].astype(np.float32)
                label = label.reshape([label.shape[0], 1, label.shape[1], 1])
                Hsh_L = self.Hsh_L.eval(feed_dict={self.ph['label_input']: label})
                B[ind, :] = Hsh_L
        elif generate == "image":
            num_data = len(Modal)
            index = np.linspace(0, num_data - 1, num_data).astype(int)
            B = np.zeros([num_data, bit], dtype=np.float32)
            for iter in tqdm(xrange(num_data / batch_size + 1)):
                ind = index[iter * batch_size: min((iter + 1) * batch_size, num_data)]
                mean_pixel = np.repeat(self.meanpix[:, :, :, np.newaxis], len(ind), axis=3)
                image = Modal[ind, :, :, :].astype(np.float64)
                image = image - mean_pixel.astype(np.float64).transpose(3, 0, 1, 2)
                Hsh_I = self.Hsh_I.eval(feed_dict={self.ph['image_input']: image})
                B[ind, :] = Hsh_I
        else:
            num_data = Modal.shape[0]
            index = np.linspace(0, num_data - 1, num_data).astype(int)
            B = np.zeros([num_data, bit], dtype=np.float32)
            for iter in tqdm(xrange(num_data / batch_size + 1)):
                ind = index[iter * batch_size: min((iter + 1) * batch_size, num_data)]
                text = Modal[ind, :].astype(np.float32)
                text = text.reshape([text.shape[0], 1, text.shape[1], 1])
                Hsh_T = self.Hsh_T.eval(feed_dict={self.ph['text_input']: text})
                B[ind, :] = Hsh_T
        B = np.sign(B)
        return B
Example #12
def read_raw_docs(lines: List[str], size: int, workers: int) -> np.ndarray:
    if size == -1:
        size = len(lines)
    lines = lines[:size]
    documents = np.empty(size, dtype=object)
    memory_impact = sum([sys.getsizeof(s) for s in lines])
    # jeopardy 32862372
    # recipes  187414159
    if memory_impact < 50000000:
        offset = 0
        linebins = np.array_split(lines, workers)  # this is the offending large memory line
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = {executor.submit(clean_text, linebins[i]): i
                       for i in range(workers)}
            for future in tqdm(concurrent.futures.as_completed(futures),
                               desc='Tokenizing Documents', total=workers, leave=True):
                index = futures[future]
                for i, line in enumerate(future.result()):
                    documents[offset + i] = line
                offset += len(future.result())
    else:
        print('Use Large Memory Algorithm')
        offset = 0
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = {executor.submit(clean_line, lines[i]): i
                       for i in range(size)}
            for future in tqdm(concurrent.futures.as_completed(futures),
                               desc='Tokenizing Documents', total=size, leave=True):
                documents[offset] = future.result()
                offset += 1
    return documents
Example #13
def pro_progess(filepath="../data"):
    height = 299
    train_files = os.listdir(filepath + '/train')
    train = np.zeros((len(train_files), height, height, 3), dtype=np.uint8)
    labels = [1 if x[:3] == 'dog' else 0 for x in train_files]  # 1 = dog, 0 = cat

    test_files = os.listdir(filepath + '/test')
    test = np.zeros((len(test_files), height, height, 3), dtype=np.uint8)

    for i in tqdm(range(len(train_files))):
        filename = filepath + '/train/' + train_files[i]
        img = cv2.imread(filename)
        img = cv2.resize(img, (height, height))
        train[i] = img[:, :, ::-1]

    for i in tqdm(range(len(test_files))):
        filename = filepath + '/test/' + test_files[i]
        img = cv2.imread(filename)
        img = cv2.resize(img, (height, height))
        test[i] = img[:, :, ::-1]

    print('Training Data Size = %.2f GB' % (sys.getsizeof(train) / 1024.0 ** 3))
    print('Testing Data Size = %.2f GB' % (sys.getsizeof(test) / 1024.0 ** 3))
    X_train, X_val, y_train, y_val = train_test_split(
        train, labels, shuffle=True, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val
def run():

    for dirname in ['anjuke_new_house', 'anjuke_second_house', 'anjuke_renting_house']:
        textfiles = glob.glob('%s/*txt' % dirname)
        if len(textfiles) != 0:
            print ">> compress files under %s" % dirname
            f = zipfile.ZipFile('%s/%s.zip' % (dirname, dirname), 'w', zipfile.ZIP_DEFLATED)
            for textfile in tqdm(textfiles):
                f.write(textfile)
                os.remove(textfile)
            f.close()
def createDataTxt(imagePath, annotationPath, imagesInDir, split=False):
    JPG = '.jpg'
    TRAINING = 'training/'
    VALIDATION = 'validation/'

    if split:
        annotatedImages = os.listdir(annotationPath)
        # np.random.shuffle(annotatedImages)
        splitSize = ceil(len(annotatedImages) * 0.85)

        annotatedImagesTrain = annotatedImages[:splitSize]
        annotatedImagesValidation = annotatedImages[splitSize:]
    else:
        annotatedImagesTrain = os.listdir(join(annotationPath, TRAINING))
        annotatedImagesValidation = os.listdir(join(annotationPath, VALIDATION))

    with open(imagesInDir + 'train.txt', 'w') as file:
        for ann in tqdm(annotatedImagesTrain, desc='Writing train.txt for input dataset'):
            if isfile(join(imagePath, TRAINING, splitext(ann)[0]) + JPG):
                file.write(' '.join(
                    [join(imagePath, TRAINING, splitext(ann)[0]) + JPG,
                     join(annotationPath, TRAINING, ann)]) + '\n')

    with open(imagesInDir + 'val.txt', 'w') as file:
        for annv in tqdm(annotatedImagesValidation, desc='Writing val.txt for input dataset'):
            if isfile(join(imagePath, VALIDATION, splitext(annv)[0]) + JPG):
                file.write(' '.join(
                    [join(imagePath, VALIDATION, splitext(annv)[0]) + JPG,
                     join(annotationPath, VALIDATION, annv)]) + '\n')

    return
Example #16
def store_contents(data_path, save_path, preprocess, num_workers=None):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of files
          containing json encoded documents (must have `id` and `text` fields).
        save_path: Path to output sqlite db.
        preprocess: Path to file defining a custom `preprocess` function. Takes
          in and outputs a structured doc.
        num_workers: Number of parallel processes to use when reading docs.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    c = conn.cursor()
    c.execute("CREATE TABLE documents (id PRIMARY KEY, text);")

    workers = ProcessPool(num_workers, initializer=init, initargs=(preprocess,))
    files = [f for f in iter_files(data_path)]
    count = 0
    with tqdm(total=len(files)) as pbar:
        for pairs in tqdm(workers.imap_unordered(get_contents, files)):
            count += len(pairs)
            c.executemany("INSERT INTO documents VALUES (?,?)", pairs)
            pbar.update()
    logger.info('Read %d docs.' % count)
    logger.info('Committing...')
    conn.commit()
    conn.close()
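# A small companion sketch (not part of the original): reading the documents
# back out of the sqlite file written above, with a tqdm bar over the rows.
import sqlite3
from tqdm import tqdm

def iter_documents(db_path):
    conn = sqlite3.connect(db_path)
    try:
        total = conn.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
        for doc_id, text in tqdm(conn.execute("SELECT id, text FROM documents"),
                                 total=total, desc='Reading docs'):
            yield doc_id, text
    finally:
        conn.close()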
Example #17
def preprocess_simple_predict():
    df = pd.read_csv('data/data_full.csv')
    df = df[df.is_fake==0]
    res_df = df.ID.values
    df_target = df[df.target > 0].drop('ID,is_train,is_fake'.split(','), axis=1)
    target = df_target.target.values
    data = df_target.drop(['target',], axis=1).values.astype(int)
    val_sum = {}
    for i, dat in tqdm(enumerate(data)):
        for d in dat:
            if d <= 0:
                continue
            if d not in val_sum:
                val_sum[d] = [0, 0]
            val_sum[d][0] += target[i]
            val_sum[d][1] += 1
    df['simple_predict'] = 0
    for i, row in tqdm(df.drop('ID,is_train,is_fake,target'.split(','), axis=1).iterrows()):
        summ = 0
        cnt = 0.000001
        for val in row:
            if val not in val_sum or val_sum[val][1] < 10:
                continue
            summ += val_sum[val][0]
            cnt += val_sum[val][1]
        df.loc[i, 'simple_predict'] = summ / cnt
    df[['ID', 'simple_predict']].to_csv('data/feat_simple_predict.csv', index=False)
Example #18
def test_pandas_groupby_apply():
    """ Test pandas.DataFrame.groupby(...).progress_apply """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except:
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 50, (500, 3)))
        dfs = pd.DataFrame(randint(0, 50, (500, 3)),
                           columns=list('abc'))
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True))
        df.groupby(0).progress_apply(lambda x: None)
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True))
        dfs.groupby(['a']).progress_apply(lambda x: None)

        our_file.seek(0)

        # don't expect final output since no `leave` and
        # high dynamic `miniters`
        nexres = '100%|##########|'
        if nexres in our_file.read():
            our_file.seek(0)
            raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format(
                nexres, our_file.read()))
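# In current tqdm releases tqdm_pandas() is deprecated in favour of the
# tqdm.pandas() registration helper; an equivalent modern sketch:
import pandas as pd
from numpy.random import randint
from tqdm import tqdm

tqdm.pandas(desc='groupby-apply')
df_demo = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))
df_demo.groupby('a').progress_apply(lambda g: g.sum())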
Example #19
def process_ngram(ngram_generator, limit=LIMIT):
    result = {}

    print 'Processing ngrams:'
    for ngram in tqdm(ngram_generator):
        if ngram not in result:
            result[ngram] = 1

        else:
            result[ngram] += 1

    print 'Removing results with n=1:'
    to_remove = set()
    for ngram in tqdm(result):
        if result[ngram] <= 1:
            to_remove.add(ngram)

    for ngram in to_remove:
        del result[ngram]

    sorted_result = sorted(result.items(), lambda x, y: result[x[0]].__cmp__(result[y[0]]), reverse=True)

    if limit:
        return sorted_result[:limit]

    else:
        return sorted_result
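# An equivalent, more idiomatic counting sketch using collections.Counter; it
# produces the same (ngram, count) ordering as the manual dict above and is
# shown for comparison only (the module's own LIMIT constant is not reused here).
from collections import Counter
from tqdm import tqdm

def process_ngram_counter(ngram_generator, limit=None):
    counts = Counter(tqdm(ngram_generator, desc='Processing ngrams'))
    frequent = [(ngram, n) for ngram, n in counts.most_common() if n > 1]
    return frequent[:limit] if limit else frequent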
	def getFeatures(self):
		files = glob.glob(self.objectPath+self.preProcessedData+'*.npy')
		split_length = None

		if self.windowSize != "None":
			split_length = self.windowSize * self.samplingFrequency

		split_based = open(self.objectPath+self.dataFeatures+self.featureExtracted, 'w', newline='')
		writer = csv.writer(split_based, delimiter=',')
		header_writen = False
		for file in tqdm(files):
			file_split = file.split('_')
			recording_class = file_split[2]
			recording = np.load(file)
			i = 0
			for channel in tqdm(recording):
				if self.windowSize == "None":
					split_length = len(channel)
				limit = int(len(channel)/split_length)*split_length
				channel = channel[0:limit]
				splits = np.split(channel,limit//split_length)
				j = 1
				for split in tqdm(splits):
					self.channel_data = split
					data_ = self.runPipeline()
					temp = [file_split[0],recording_class,self.channels[i],j]
					features = list(data_[0])
					if not header_writen:
						writer.writerow(  ['filename','experiment_identifier','channel_name','split_number'] + list(data_[1]) )
						header_writen = True
					writer.writerow(temp+features)
					#break
					j += 1
					#break
				i += 1
Example #21
    def clone(self, url=None, update_existing_config=False):
        """Clone a data store

        Parameters
        ----------
        url : str
            URL of the data store to clone
        update_existing_config : bool
            If True, updates the existing config file to point to the given
            file for the store configuration
        """
        from tqdm import tqdm
        try:
            makedirs(self.powdir)
            print('Cloning...', file=sys.stderr)
            with tqdm(file=sys.stderr, unit=' objects', miniters=0) as progress:
                self.repository_provider.clone(url, base=self.powdir, progress=progress)
            if not exists(self.config_file):
                self._init_config_file()
            self._init_store()
            print('Deserializing...', file=sys.stderr)
            with tqdm(unit=' ctx', file=sys.stderr) as ctx_prog, \
                    tqdm(unit=' triples', file=sys.stderr, leave=False) as trip_prog:
                self._load_all_graphs(ctx_prog, trip_prog)
            print('Done!', file=sys.stderr)
        except BaseException as e:
            self._ensure_no_powdir()
            raise e
Example #22
    def predict_kfold(cls, X, y, n_folds=10, seed=0, textModel_params={},
                      kfolds=None, pool=None, use_tqdm=True):
        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(x, **kwargs):
                return x

        le = preprocessing.LabelEncoder().fit(y)
        y = np.array(le.transform(y))
        hy = np.zeros(len(y), dtype=int)  # np.int was removed in newer NumPy releases
        if kfolds is None:
            kfolds = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                     random_state=seed).split(X, y)
        args = [(X, y, tr, ts, textModel_params) for tr, ts in kfolds]
        if pool is not None:
            if use_tqdm:
                res = [x for x in tqdm(pool.imap_unordered(cls.train_predict_pool, args),
                                       desc='Params', total=len(args))]
            else:
                res = [x for x in pool.imap_unordered(cls.train_predict_pool, args)]
        else:
            if use_tqdm:
                args = tqdm(args)
            res = [cls.train_predict_pool(x) for x in args]
        for ts, _hy in res:
            hy[ts] = _hy
        return le.inverse_transform(hy)
Example #23
def normalize_features(X_train, X_test):
    n_features = X_train.shape[1]

    feature_sums = np.sum(X_test, axis=1)
    nonblack_vectors = np.where(feature_sums > 0,1,0)
    #print nonblack_vectors.shape

    mask = []
    for x in range(X_test.shape[0]):
        mask.append([nonblack_vectors[x]]*n_features)
    mask = np.array(mask)

    X_test_nonblack = X_test[np.where(feature_sums > 0)]

    X = np.concatenate((X_train, X_test_nonblack))
    #print X, X.shape

    mean = np.mean(X,axis=0)
    std = np.std(X,axis=0)

    for d in tqdm(range(len(X_train))):
        X_train[d] = (X_train[d] - mean) / std
    for d in tqdm(range(len(X_test))):
        X_test[d] = (X_test[d] - mean) / std

    #Make once fully black vectors fully black again
    X_test = X_test*mask

    return X_train, X_test
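# A vectorized sketch of the same standardisation as the loops above, using
# numpy broadcasting; behaviour is equivalent up to floating point, assuming
# std has no zero entries.
import numpy as np

def normalize_features_vectorized(X_train, X_test):
    nonblack = X_test.sum(axis=1) > 0
    X = np.concatenate((X_train, X_test[nonblack]))
    mean, std = X.mean(axis=0), X.std(axis=0)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    X_test[~nonblack] = 0  # make once fully black vectors fully black again
    return X_train, X_test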
Example #24
def scan_dir(path, dir_json):
    # Walk the directory tree and collect movie files larger than 25 MB
    for root, dirs, files in tqdm(os.walk(path)):
        for name in files:
            path = os.path.join(root, name)
            if os.path.getsize(path) > (25*1024*1024):
                ext = os.path.splitext(name)[1]
                if ext in EXT:
                    movie_name.append(name)

    with tqdm(total=len(movie_name), leave=True, unit='B',
              unit_scale=True) as pbar:
        for name in movie_name:
            data = get_movie_info(name)
            pbar.update()
            if data is not None and data['Response'] == 'True':
                for key, val in data.items():
                    if val == "N/A":
                        data[key] = "-"  # Should N/A be replaced with `-`?
                movies.append(data)
            else:
                if data is not None:
                    movie_not_found.append(name)
        with open(dir_json, "w") as out:
            json.dump(movies, out, indent=2)
Example #25
def compare_assemblies(assemblies, chunk_size = 2000, identity_threshold = 0.40):
    """
    compares a set of assemblies:
    assemblies is a dictionary with names of the assemblies as keys and fasta-files of the assemblies as values
    """
    similarities = {}


    print "make blast dbs"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_cmd = ["makeblastdb" ,"-in", subject, "-dbtype", "nucl", "-out", subject]
        with open("/dev/null") as null:
            blastdb_return = call(blast_db_cmd, stdout=null)

    print "Run the hell out of it"
    for scaff_name, scaff in tqdm(assemblies.iteritems()):
        similarities[scaff_name] = {}
        chopped_up_query = "tmp.fasta"
        nb_chunks = len(cut_up_fasta(scaff, chopped_up_query, chunk_size))
        for subject_name, subject in assemblies.iteritems():
            nics = find_NICs(chopped_up_query, subject, identity_threshold, blast_db = False)
#            print scaff_name, "vs", subject_name
            similarities[scaff_name][subject_name] = float(len(nics)) / nb_chunks
    os.remove(chopped_up_query)

    print "clean up"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_files = [subject + ".nhr", subject + ".nin",  subject + ".nsq"]
        for f in blast_db_files:
            os.remove(f)


    similars =  DataFrame.from_dict(similarities)
    return similars
Example #26
def run(*args):
    """Reset the in_stock Card property. It was set to True by default when it
    should have been False, so only cards that were bought at least once or
    added from an applied inventory should end up set to True.
    """

    yes_answers = ["y", "Y", "o", "O", ""]
    go_all_cards = raw_input("Go with all cards ? [Y/n]")
    go_inventories = raw_input("Go with cards applied from inventories ? [Y/n]")

    if go_all_cards in yes_answers:
        print("Setting all cards to not in stock...")
        for card in tqdm(Card.objects.all()):
            card.in_stock = False
            card.save()

    if go_inventories in yes_answers:
        print("Registering cards applied from inventories...")
        for inv in tqdm(Inventory.objects.filter(applied=True)):
            print("Going with inv {}".format(inv.name))
            for card_set in inv.inventorycopies_set.all():
                card_set.card.in_stock = True
                card_set.card.save()

    print("All done.")
def run():
    batch_size = 4000

    print 'reading image hashes from image_hashes.csv...',
    t0 = time()
    global df_hashes
    df_hashes = pd.read_csv('image_hashes.csv')
    df_hashes.set_index('image_id', inplace=True)
    print 'took %0.5fs' % (time() - t0)

    pool = avito_utils.PoolWrapper(processes=4)

    print 'processing train data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_imagehash_train.csv')

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_imagehash_train.csv')

    print 'processing train data took %0.5fs' % (time() - t0)

    print 'processing test data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_imagehash_test.csv')

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_imagehash_test.csv')

    print 'processing test data took %0.5fs' % (time() - t0)

    pool.close()
Example #28
def make_tqdm_iterator(**kwargs):
    options = {
        "file": sys.stdout,
        "leave": True
    }
    options.update(kwargs)

    if session_type() == 'kernel':
        # from IPython import display
        # capture_stderr = StringIO()
        # with RedirectStdStreams(stderr=capture_stderr):
            # try:
                # iterator = tqdm_notebook(**options)
            # except:
                # failed = True
            # else:
                # failed = False
                # err_out = capture_stderr.getvalue()
        # capture_stderr.close()
        # if failed or err_out.lower().find("widget javascript not detected") > -1:
            # display.clear_output(wait=True)
            # iterator = tqdm(**options)
        iterator = tqdm(**options)

    else:
        iterator = tqdm(**options)
    return iterator
Example #29
def download_url(url, root, filename, md5):
    from six.moves import urllib

    root = os.path.expanduser(root)
    fpath = os.path.join(root, filename)

    try:
        os.makedirs(root)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    # downloads file
    if os.path.isfile(fpath) and check_integrity(fpath, md5):
        print('Using downloaded and verified file: ' + fpath)
    else:
        try:
            print('Downloading ' + url + ' to ' + fpath)
            urllib.request.urlretrieve(
                url, fpath,
                reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True))
            )
        except:
            if url[:5] == 'https':
                url = url.replace('https:', 'http:')
                print('Failed download. Trying https -> http instead.'
                      ' Downloading ' + url + ' to ' + fpath)
                urllib.request.urlretrieve(
                    url, fpath,
                    reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True))
                )
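# gen_bar_updater() is not shown above; a plausible tqdm-based reporthook for
# urlretrieve, written as a sketch of what such a helper usually looks like:
from tqdm import tqdm

def gen_bar_updater_sketch(pbar):
    def bar_update(count, block_size, total_size):
        if pbar.total is None and total_size > 0:
            pbar.total = total_size
        pbar.update(count * block_size - pbar.n)  # advance by the delta only
    return bar_update

# Usage: urllib.request.urlretrieve(url, fpath,
#            reporthook=gen_bar_updater_sketch(tqdm(unit='B', unit_scale=True)))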
Example #30
def test_ascii():
    """ Test ascii/unicode bar """
    # Test ascii autodetection
    with closing(StringIO()) as our_file:
        with tqdm(total=10, file=our_file, ascii=None) as t:
            assert t.ascii  # TODO: this may fail in the future

    # Test ascii bar
    with closing(StringIO()) as our_file:
        for _ in tqdm(_range(3), total=15, file=our_file, miniters=1,
                      mininterval=0, ascii=True):
            pass
        our_file.seek(0)
        res = our_file.read().strip("\r").split("\r")
    assert '7%|6' in res[1]
    assert '13%|#3' in res[2]
    assert '20%|##' in res[3]

    # Test unicode bar
    with closing(UnicodeIO()) as our_file:
        with tqdm(total=15, file=our_file, ascii=False, mininterval=0) as t:
            for _ in _range(3):
                t.update()
        our_file.seek(0)
        res = our_file.read().strip("\r").split("\r")
    assert "7%|\u258b" in res[1]
    assert "13%|\u2588\u258e" in res[2]
    assert "20%|\u2588\u2588" in res[3]
Example #31
def paramsearch(binned,
                samples_per_knot,
                n_valid_samples,
                n_train_folds=3,
                n_valid_folds=1,
                n_test_folds=1,
                knot_range=(-1, 2),
                smoothness_range=(1e-2, 1e2),
                warpreg_range=(1e-2, 1e1),
                iter_range=(50, 300),
                warp_iter_range=(50, 300),
                outfile=None):
    """
    Performs nested cross-validation over shift-only, linear, and
    piecewise linear warping models, in order to tune all hyperparameters
    and compare performance. For each set of randomly sampled parameters,
    trials and units are randomly split `n_folds` times into train/test
    groups. An R-squared metric of across-trial reliability is measured
    on each test set; larger scores indicate warping functions that generalize
    better.

    Parameters
    ----------
    binned : ndarray
        trials x timepoints x neurons binned spikes
    samples_per_knot : int
        Number of cross-validation runs per knot.
    n_valid_samples : int
        Number of inner samples to optimize smoothness and warp
        complexity regularization parameters on validation set.
    n_train_folds : int
        Number of folds used for training.
    n_valid_folds : int
        Number of folds used for validation.
    n_test_folds : int
        Number of folds used for testing.
    knot_range : tuple of ints
        Specifies [minimum, maximum) number of knots in warping
        functions. A value of -1 denotes a shift-only
        warping model; a value of 0 denotes a linear warping model (no
        interior knots); etc.
    smoothness_range : tuple of floats
        Specifies [minimum, maximum) strength of regularization on
        template smoothness; larger values penalize roughness over time
        more stringently. The regularization strength for each model
        is randomly sampled from a log-uniform distribution over this
        interval.
    warpreg_range : tuple of floats
        Specifies [minimum, maximum) strength of regularization on the
        area between the warping functions and the identity line;
        larger values penalize warping more stringently. The
        regularization strength for each model is randomly sampled from
        a log-uniform distribution over this interval.
    iter_range : tuple of ints
        Specifies [minimum, maximum) number of iterations used to optimize
        each model, which are sampled log-uniformly over this interval
        and constrained to be integer-valued.
    warp_iter_range : tuple of ints
        Specifies [minimum, maximum) number of inner iterations to apply
        to update the warping functions on each step of optimization.
        These are also randomly sampled log-uniformly over the specified
        interval.
    outfile : None or str (optional)
        If provided, data are saved after each iteration to this filename.

    Returns
    -------
    results : dict
        Dictionary holding results:

        "knots" : (n_samples,) array holding number of knots in piecewise
            linear warping function for each evaluated model.

        "smoothness" : (n_samples, n_valid_samples) array holding sampled
            regularization strengths on warping templates, penalizing
            roughness.

        "warp_reg" : (n_samples, n_valid_samples) array holding sampled
            regularization strengths on warping function distance from
            identity.

        "iterations" : (n_samples, n_valid_samples) array holding number
            of model optimization steps.

        "warp_iterations" : (n_samples, n_valid_samples) array holding number
            of inner iteration steps for fitting warping functions.

        "train_rsq": (n_samples, n_valid_samples) array holding model
            performance on the training set.

        "valid_rsq": (n_samples, n_valid_samples) array holding model
            performance on the validation set.

        "test_rsq": (n_samples,) array holding model performance on the
            test set.

        "loss_hists" : (n_samples, n_valid_samples, n_iterations + 1) array
            holding the learning curves for all models. The loss is computed
            over the combined train and validation set.

    Notes
    -----
    Only implemented for quadratic loss.
    """

    # Dataset dimensions (trials x timepoints x units).
    K, T, N = binned.shape

    # Randomly draw all parameter settings for each model.
    knots = np.tile(np.arange(*knot_range), samples_per_knot)
    n_samples = len(knots)

    smoothness = _sample_log_uniform(smoothness_range,
                                     size=(n_samples, n_valid_samples))
    warp_reg = _sample_log_uniform(warpreg_range,
                                   size=(n_samples, n_valid_samples))
    iterations = _sample_log_uniform(iter_range,
                                     size=(n_samples,
                                           n_valid_samples)).astype('int')
    warp_iterations = _sample_log_uniform(warp_iter_range,
                                          size=(n_samples,
                                                n_valid_samples)).astype('int')

    # Initialize arrays to store losses.
    train_rsq = np.empty((n_samples, n_valid_samples))
    valid_rsq = np.full((n_samples, n_valid_samples), -np.inf)
    test_rsq = np.empty(n_samples)
    loss_hists = np.full((n_samples, n_valid_samples, iter_range[1]), np.nan)

    progress_bar = tqdm(total=n_samples * n_valid_samples)

    for i, j in itertools.product(range(n_samples), range(n_valid_samples)):

        # Update train - validation - test sets.
        if j == 0:
            train_units, val_units, test_units = _crossval_partition(
                N, n_train_folds, n_valid_folds, n_test_folds)
            train_trials, val_trials, test_trials = _crossval_partition(
                K, n_train_folds, n_valid_folds, n_test_folds)

        # Create model instance.
        model_kw = {
            "smoothness_reg_scale": smoothness[i, j],
            "warp_reg_scale": warp_reg[i, j]
        }
        if knots[i] == -1:
            model = ShiftWarping(**model_kw)
        else:
            model = PiecewiseWarping(n_knots=knots[i], **model_kw)

        # Fit model.
        fit_kw = {
            "verbose": False,
            "iterations": iterations[i, j],
            "warp_iterations": warp_iterations[i, j],
            "neuron_idx": train_units,
            "trial_idx": train_trials,
        }
        model.fit(binned, **fit_kw)

        # Store optimization learning curve.
        loss_hists[i, j, :(iterations[i, j] + 1)] = model.loss_hist

        # Create baseline model (simple trial average).
        baseline_pred = np.tile(
            np.mean(binned[train_trials], axis=0, keepdims=True),
            (binned.shape[0], 1, 1))

        # Record loss on training set.
        pred = model.predict()
        train_rsq[i, j] = 1 - (
            _crossval_loss(pred, binned, train_trials, train_units) /
            _crossval_loss(baseline_pred, binned, train_trials, train_units))

        # Record loss on validation set.
        valid_rsq[i, j] = 1 - (
            _crossval_loss(pred, binned, val_trials, val_units) /
            _crossval_loss(baseline_pred, binned, val_trials, val_units))

        # Save loss on test set if validation loss is optimal
        if np.argmax(valid_rsq[i]) == j:
            test_rsq[i] = 1 - (
                _crossval_loss(pred, binned, test_trials, test_units) /
                _crossval_loss(baseline_pred, binned, test_trials, test_units))

        # Save results.
        if j == n_valid_samples - 1:
            results = {
                "knots": knots[:(i + 1)],
                "smoothness": smoothness[:(i + 1)],
                "warp_reg": warp_reg[:(i + 1)],
                "iterations": iterations[:(i + 1)],
                "warp_iterations": warp_iterations[:(i + 1)],
                "train_rsq": train_rsq[:(i + 1)],
                "valid_rsq": valid_rsq[:(i + 1)],
                "test_rsq": test_rsq[:(i + 1)],
                "loss_hists": loss_hists[:(i + 1)],
            }
            if outfile is not None:
                np.savez(outfile, **results)

        # Update progress bar.
        progress_bar.update(1)

    return results
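# A minimal usage sketch for paramsearch() on a tiny synthetic dataset; the
# shapes and sample counts below are placeholders, not values from the original
# experiments, and the surrounding module is assumed to provide ShiftWarping,
# PiecewiseWarping and the _crossval_* helpers used above.
import numpy as np

binned_demo = np.random.poisson(1.0, size=(40, 100, 15))  # trials x time x units
results_demo = paramsearch(binned_demo,
                           samples_per_knot=2,
                           n_valid_samples=3,
                           outfile='warp_paramsearch_demo.npz')
print(results_demo['test_rsq'])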
Example #32
def train_one(
    config: Config,
    train_batches: List[Tuple],
    dev_insts: List[Instance],
    dev_batches: List[Tuple],
    model_name: str,
    test_insts: List[Instance] = None,
    test_batches: List[Tuple] = None,
    config_name: str = None,
    result_filename: str = None,
) -> NNCRF:
    model = NNCRF(config)
    model.train()
    optimizer = get_optimizer(config, model)
    lr_scheduler = SlantedTriangular(optimizer,
                                     config.num_epochs,
                                     num_steps_per_epoch=len(train_batches),
                                     ratio=16)
    epoch = config.num_epochs
    best_dev_f1 = -1
    saved_test_metrics = None
    for i in range(1, epoch + 1):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        # if config.optimizer.lower() == "sgd":
        #     optimizer = lr_decay(config, optimizer, i)
        lr_scheduler.step(epoch=i)

        for index in tqdm(np.random.permutation(len(train_batches)),
                          f"Training epoch {i}", len(train_batches)):
            model.train()
            loss = model(*train_batches[index])
            epoch_loss += loss.item()
            # print(f"Batch loss: {loss.item()}")
            loss.backward()
            optimizer.step()
            model.zero_grad()
            lr_scheduler.step_batch()
        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (i, epoch_loss / len(train_batches), end_time - start_time),
              flush=True)

        model.eval()
        # metric is [precision, recall, f_score]
        dev_metrics = evaluate_model(config, model, dev_batches, "dev",
                                     dev_insts)
        if test_insts is not None:
            test_metrics = evaluate_model(config, model, test_batches, "test",
                                          test_insts)
        if dev_metrics[2] > best_dev_f1:
            print("saving the best model...")
            best_dev_f1 = dev_metrics[2]
            if test_insts is not None:
                saved_test_metrics = test_metrics
            torch.save(model.state_dict(), model_name)
            # # Save the corresponding config as well.
            if config_name:
                f = open(config_name, "wb")
                pickle.dump(config, f)
                f.close()
            if result_filename:
                write_results(result_filename, test_insts)
        model.zero_grad()
    if test_insts is not None:
        print(f"The best dev F1: {best_dev_f1}")
        print(f"The corresponding test: {saved_test_metrics}")
    return model
Example #33
def handle_update(args):
    config = Configuration(use_config_cache=args.config_cache)
    failed_pulls = []
    failed_clones = []

    assert_lfs_installed()

    if not args.no_config:
        logger.info("Updating orchestra configuration")
        if not git_pull(config.orchestra_dotdir):
            failed_pulls.append(f"orchestra configuration ({config.orchestra_dotdir})")

    logger.info("Updating binary archives")
    os.makedirs(config.binary_archives_dir, exist_ok=True)
    progress_bar = tqdm(config.binary_archives_remotes.items(), unit="archives")
    for name, url in progress_bar:
        binary_archive_path = os.path.join(config.binary_archives_dir, name)
        progress_bar.set_postfix_str(f"{name}")
        if os.path.exists(binary_archive_path):
            logger.debug(f"Pulling binary archive {name}")
            if not pull_binary_archive(name, config):
                failed_pulls.append(f"Binary archive {name} ({os.path.join(config.binary_archives_dir, name)})")
        else:
            logger.info(f"Trying to clone binary archive from remote {name} ({url})")
            if not clone_binary_archive(name, url, config):
                failed_clones.append(f"Binary archive {name} ({url})!")

    logger.info("Resetting ls-remote cached info")
    ls_remote_cache = os.path.join(config.cache_dir, "remote_refs_cache.json")
    if os.path.exists(ls_remote_cache):
        os.remove(ls_remote_cache)

    logger.info("Updating ls-remote cached info")
    failed_ls_remotes = config.remote_heads_cache.rebuild_cache(parallelism=args.parallelism)

    to_pull = []
    for _, component in config.components.items():
        if not component.clone:
            continue

        source_path = os.path.join(config.sources_dir, component.name)
        if not os.path.exists(source_path):
            continue

        to_pull.append(component)

    if to_pull:
        logger.info("Updating repositories")
        progress_bar = tqdm(to_pull, unit="components")
        for component in progress_bar:
            source_path = os.path.join(config.sources_dir, component.name)
            logger.debug(f"Pulling {component.name}")
            progress_bar.set_postfix_str(f"{component.name}")

            if not is_root_of_git_repo(source_path):
                failed_pulls.append(f"Repository {component.name}: Directory {source_path} is not a git repo")
                continue

            if not git_pull(source_path):
                failed_pulls.append(f"Repository {component.name}")

    if failed_pulls:
        formatted_failed_pulls = "\n".join([f"  - {repo}" for repo in failed_pulls])
        # Note: f-strings don't account for indentation, using a template is more practical
        failed_git_pull_template = dedent(
            """
            Could not git pull --ff-only the following repositories:
            {formatted_failed_pulls}

            Suggestions:
                - check your network connection
                - commit your work
                - `git pull --rebase`, to pull remote changes and apply your commits on top
                - `git push` your changes to the remotes
            """
        )
        failed_git_pull_suggestion = failed_git_pull_template.format(formatted_failed_pulls=formatted_failed_pulls)
        logger.error(failed_git_pull_suggestion)

    if failed_clones:
        formatted_failed_clones = "\n".join([f"  - {repo}" for repo in failed_clones])
        # Note: f-strings don't account for indentation, using a template is more practical
        failed_git_clone_template = dedent(
            """
            Could not clone the following repositories:
            {formatted_failed_clones}

            Suggestions:
                - check your network connection
                - check your ssh and git configuration (try manually cloning the repositories)
            """
        )
        failed_git_clone_suggestion = failed_git_clone_template.format(formatted_failed_clones=formatted_failed_clones)
        logger.error(failed_git_clone_suggestion)

    if failed_ls_remotes:
        formatted_failed_ls_remotes = "\n".join([f"  - {repo}" for repo in failed_ls_remotes])
        # Note: f-strings don't account for indentation, using a template is more practical
        failed_git_clone_template = dedent(
            """
            Could not find the following repositories in any remote:
            {formatted_failed_ls_remotes}

            You will not be able to install components that depend on them.
            """
        )
        failed_ls_remote_suggestion = failed_git_clone_template.format(
            formatted_failed_ls_remotes=formatted_failed_ls_remotes
        )
        logger.info(failed_ls_remote_suggestion)

    if failed_pulls or failed_clones or failed_ls_remotes:
        return 1
    else:
        return 0
Example #34
def main():
    fic_ids, fandom, headers, restart, idlist_is_csv, only_first_chap, output_dirpath = get_args(
    )
    os.chdir(os.getcwd())
    storycolumns = [
        'fic_id', 'title', 'author', 'author_key', 'rating', 'category',
        'fandom', 'relationship', 'character', 'additional tags', 'language',
        'published', 'status', 'status date', 'words', 'comments', 'kudos',
        'bookmarks', 'hits', 'chapter_count', 'series', 'seriespart',
        'seriesid', 'summary', 'preface_notes', 'afterword_notes'
    ]
    chaptercolumns = [
        'fic_id', 'title', 'summary', 'preface_notes', 'afterword_notes',
        'chapter_num', 'chapter_title', 'paragraph_count'
    ]
    textcolumns = ['fic_id', 'chapter_id', 'para_id', 'text']
    if not os.path.exists(workdir(output_dirpath, fandom)):
        os.mkdir(workdir(output_dirpath, fandom))
    if not os.path.exists(contentdir(output_dirpath, fandom)):
        os.mkdir(contentdir(output_dirpath, fandom))
    with open(storiescsv(output_dirpath, fandom), 'a') as f_out:
        storywriter = csv.writer(f_out)
        with open(chapterscsv(output_dirpath, fandom), 'a') as ch_out:
            chapterwriter = csv.writer(ch_out)
            with open(errorscsv(output_dirpath, fandom), 'a') as e_out:
                errorwriter = csv.writer(e_out)
                #does the csv already exist? if not, let's write a header row.
                if os.stat(storiescsv(output_dirpath, fandom)).st_size == 0:
                    print('Writing a header row for the csv.')
                    storywriter.writerow(storycolumns)
                if os.stat(chapterscsv(output_dirpath, fandom)).st_size == 0:
                    print('Writing a header row for the csv.')
                    chapterwriter.writerow(chaptercolumns)
                if idlist_is_csv:
                    csv_fname = fic_ids[0]
                    total_lines = 0

                    # Count fics remaining
                    with open(csv_fname, 'r') as f_in:
                        reader = csv.reader(f_in)
                        for row in reader:
                            if not row:
                                continue
                            total_lines += 1

                    # Scrape fics
                    with open(csv_fname, 'r+') as f_in:
                        reader = csv.reader(f_in)
                        if restart == '':
                            for row in tqdm(reader,
                                            total=total_lines,
                                            ncols=70):
                                if not row:
                                    continue
                                write_fic_to_csv(fandom,
                                                 row[0],
                                                 only_first_chap,
                                                 storywriter,
                                                 chapterwriter,
                                                 errorwriter,
                                                 storycolumns,
                                                 chaptercolumns,
                                                 headers,
                                                 output_dirpath,
                                                 write_whole_fics=True)
                        else:
                            found_restart = False
                            for row in tqdm(reader,
                                            total=total_lines,
                                            ncols=70):
                                if not row:
                                    continue
                                found_restart = process_id(
                                    row[0], restart, found_restart)
                                if found_restart:
                                    write_fic_to_csv(
                                        fandom,
                                        row[0],
                                        only_first_chap,
                                        storywriter,
                                        chapterwriter,
                                        errorwriter,
                                        storycolumns,
                                        chaptercolumns,
                                        headers,
                                        output_dirpath=output_dirpath,
                                        write_whole_fics=True)
                                else:
                                    print('Skipping already processed fic')

                else:
                    for fic_id in fic_ids:
                        write_fic_to_csv(fandom,
                                         fic_id,
                                         only_first_chap,
                                         storywriter,
                                         chapterwriter,
                                         errorwriter,
                                         storycolumns,
                                         chaptercolumns,
                                         headers,
                                         output_dirpath=output_dirpath,
                                         write_whole_fics=True)
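The restart branch above keeps skipping rows until process_id reports that the restart ID has been reached. A minimal sketch of a helper consistent with that usage; the scraper's real implementation may differ.

def process_id(current_id, restart_id, found_restart):
    """Return True once restart_id has been seen, so this row and later ones are scraped."""
    if found_restart:
        return True
    return current_id == restart_id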
Example #35
from sk_dataloader_inter import DatasetCells, CellTrainData
import pandas as pd
import torch
from tqdm import tqdm
from PIL import Image
from imgaug import augmenters as iaa
import numpy as np
from skimage import exposure

data = DatasetCells(pd.read_csv("../data/GenData/train_input_ids.csv"),
                    pd.read_csv("../data/GenData/train_labels_ids.csv"),
                    pd.read_csv("../data/GenData/train_inter_ids.csv"))

print("Adjusting Exposure..")
for j, sample in tqdm(enumerate(data), total=len(data)):
    img, label, inter = sample
    new_img = exposure.adjust_gamma(img, gamma=0.4, gain=0.9)
    new_img = Image.fromarray(new_img)
    label = Image.fromarray(label)
    inter = Image.fromarray(inter)
    new_img.save("../data/GenData/TrainData/images/" + str("%04d" % j) +
                 "_exposure_.png")
    label.save("../data/GenData/TrainData/labels/" + str("%04d" % j) +
               "_exposure_.png")
    inter.save("../data/GenData/TrainData/watershed/" + str("%04d" % j) +
               "_exposure_.png")
print("Finished..")
Example #36
    def evaluate_coco(test_loader, model):
        """
        Evaluate.

        :param test_loader: DataLoader for test data
        :param model: model
        """
        fig_test, ax_test = plt.subplots(figsize=(18, 15))

        # Make sure it's in eval mode
        model.eval()

        # Lists to store detected and true boxes, labels, scores
        det_boxes = list()
        det_labels = list()
        det_scores = list()
        true_boxes = list()
        true_labels = list()

        # For COCO evaluation
        results = []

        with torch.no_grad():
            # Batches
            for i, (images, boxes, labels,
                    index) in enumerate(tqdm(test_loader, desc='Evaluating')):
                images = images.to(device)

                # Forward prop.
                predicted_locs, predicted_scores = model(images)

                # Detect objects in SSD output
                det_boxes_batch, det_labels_batch, det_scores_batch = model.detect_objects(
                    predicted_locs,
                    predicted_scores,
                    min_score=0.1,
                    max_overlap=0.45,
                    top_k=50)
                # Evaluation MUST be at min_score=0.01, max_overlap=0.45, top_k=200 for fair comparison with the paper's results and other repos

                # Store this batch's results for mAP calculation
                boxes = [b.to(device) for b in boxes]
                labels = [l.to(device) for l in labels]

                for box_t, label_t, score_t, ids in zip(
                        det_boxes_batch, det_labels_batch, det_scores_batch,
                        index):
                    for box, label, score in zip(box_t, label_t, score_t):

                        bb = box.cpu().numpy().tolist()

                        # if score.item() > 0.1 :
                        results.append({
                            'image_id': ids.item(),
                            'category_id': label.item(),
                            'bbox': [bb[0] * input_size[1], bb[1] * input_size[0],
                                     (bb[2] - bb[0]) * input_size[1], (bb[3] - bb[1]) * input_size[0]],
                            'score': score.item()})

        rstFile = os.path.join(
            checkpoint_root,
            './COCO_TEST_det_{:s}.json'.format(checkpoint_name))
        write_result_coco(results, rstFile)

        # rstFile = os.path.join('./jobs/2019-03-26_16h07m_[SSDPed_512x640][KAISTPed_train-all-02]video_make_test_full/SSDPed_512x640_epoch_0022_det.json')

        try:

            cocoDt = cocoGt.loadRes(rstFile)
            imgIds = sorted(cocoGt.getImgIds())
            cocoEval = COCOeval(cocoGt, cocoDt, annType)
            cocoEval.params.imgIds = imgIds
            cocoEval.params.catIds = [1]
            cocoEval.evaluate(0)
            cocoEval.accumulate()
            curPerf = cocoEval.summarize(0)

            cocoEval.draw_figure(ax_test, rstFile.replace('json', 'jpg'))
            #writer.add_scalars('LAMR/fppi', {'test': curPerf}, epoch)

            print('Recall: {:}'.format(1 - cocoEval.eval['yy'][0][-1]))

        except Exception:
            import torchcv.utils.trace_error
            print('[Error] cannot evaluate by cocoEval. ')
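The results entry above converts SSD's normalized [x1, y1, x2, y2] corners into COCO's absolute [x, y, width, height] format using input_size = (height, width). The same conversion as a small standalone helper, shown only to make the arithmetic explicit (it is not a function used by the repo):

def to_coco_bbox(box, input_size):
    """box: normalized [x1, y1, x2, y2]; input_size: (height, width) in pixels."""
    x1, y1, x2, y2 = box
    h, w = input_size
    return [x1 * w, y1 * h, (x2 - x1) * w, (y2 - y1) * h]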
Example #37
# -*- coding: utf-8 -*-
"""
Created on Sun Dec  2 18:32:22 2018

@author: DC
"""
from tqdm import tqdm
import pandas as pd
import os

os.chdir("F:\\758Bdata")

sample = pd.DataFrame()
data = pd.read_csv('dentist new.csv')
text = data['text'].tolist()
text.remove(text[0])
repeat = []

for i in tqdm(range(len(text))):
    temp = text[0]
    text.remove(text[0])
    if temp in text:
        repeat.append(temp)

repeat_pd = pd.DataFrame(repeat, columns=['text'])
repeat_data = pd.merge(repeat_pd, data, how='inner', on='text')
repeat_data.drop(repeat_data.columns[1], axis=1, inplace=True)
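The pass above finds texts that occur more than once by repeatedly popping and re-scanning the list, which is quadratic. Roughly the same result can be obtained with pandas' vectorised duplicate detection; a sketch of that alternative (ignoring the header-row removal and the later column drop):

# rows whose 'text' value appears more than once, keeping every occurrence
repeat_data_fast = data[data.duplicated(subset='text', keep=False)]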
Example #38
    def get_answers(self, question:str, paragraphs:list):
        answers = None

        start_time = time.time()
        dataset, examples, features = convert_format(question, paragraphs, self.tokenizer, self.args)
        print("CONVERT에 걸린시간 : ", time.time()-start_time)

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.batch_size)
        if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(self.model)
        else:
            model = self.model

        all_results = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                feature_indices = batch[3]
                outputs = model(**inputs)

            for i, feature_index in enumerate(feature_indices):
                # TODO: i and feature_index are the same number! Simplify by removing enumerate?
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]


                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        temp_dir = 'temp'
        if not os.path.isdir(temp_dir):
            os.mkdir(temp_dir)
        output_prediction_file = os.path.join(temp_dir, "predictions_.json")
        output_nbest_file = os.path.join(temp_dir, "nbest_predictions_.json")
        output_null_log_odds_file = os.path.join(temp_dir, "null_odds_.json")

        #return all_results
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            self.args.n_best_size,
            self.args.max_answer_length,
            self.args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            self.args.verbose_logging,
            self.args.version_2_with_negative,
            self.args.null_score_diff_threshold,
            self.tokenizer,
        )

        return predictions[QUESTION_ID]
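get_answers relies on a to_list helper to move each example's logits off the GPU before wrapping them in SquadResult. A minimal sketch of such a helper, matching the usual transformers example code; the surrounding project may define it differently.

def to_list(tensor):
    return tensor.detach().cpu().tolist()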
Example #39
users = msg['users']
msg = total_batch
send_msg(s, msg)   # send total_batch of train dataset


# ## Real training process

# In[21]:


for e in range(epochs):
    for u in range(users):
        client_weights = recv_msg(s)
        ecg_client.load_state_dict(client_weights)
        ecg_client.eval()
        for i, data in enumerate(tqdm(train_loader, ncols=100, desc='Epoch '+str(e+1)+ '_' +str(u))):
            x, label = data
            x = x.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            output = ecg_client(x)
            client_output = output.clone().detach().requires_grad_(True)
            msg = {
                'client_output': client_output,
                'label': label
            }
            send_msg(s, msg)
            client_grad = recv_msg(s)
            output.backward(client_grad)
            optimizer.step()
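The client loop exchanges Python objects with the server through send_msg and recv_msg. One common way to implement that pair is pickling with a fixed-size length prefix; the sketch below assumes that protocol and is not necessarily what this project uses.

import pickle
import struct

def send_msg(sock, msg):
    payload = pickle.dumps(msg)
    sock.sendall(struct.pack('>I', len(payload)) + payload)

def recv_exact(sock, n):
    data = b''
    while len(data) < n:
        chunk = sock.recv(n - len(data))
        if not chunk:
            raise ConnectionError('socket closed')
        data += chunk
    return data

def recv_msg(sock):
    msg_len = struct.unpack('>I', recv_exact(sock, 4))[0]
    return pickle.loads(recv_exact(sock, msg_len))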
Example #40
def heldout_transform(model,
                      binned,
                      data,
                      transformed_neurons=None,
                      progress_bar=True,
                      **fit_kw):
    """
    Transform each neuron's activity by holding it out of model fitting
    and applying warping functions fit to the remaining neurons.

    Parameters
    ----------
    model : ShiftWarping or AffineWarping instance
        Model to fit
    binned : numpy.ndarray
        Array holding binned spike times (trials x num_timebins x
        neurons)
    data : SpikeData instance
        Raw spike times.
    transformed_neurons (optional) : array-like or ``None``
        Indices of neurons that are transformed. If None, all neurons
        are transformed.
    progress_bar (optional) : bool
        If True, display a progress bar over the transformed neurons.
    fit_kw (optional) : dict
        Additional keyword arguments are passed to ``model.fit(...)``.

    Returns
    -------
    aligned_data : SpikeData instance
        Transformed version of ``data`` where each neuron/unit is
        independently aligned.

    Raises
    ------
    ValueError: If ``binned`` and ``data`` have inconsistent dimensions.

    Notes
    -----
    Since a different model is fit for each neuron, the warping
    functions are not necessarily consistent across neurons in the
    returned data array. Thus, each neuron should be considered as
    having its own time axis.
    """

    # silence per-fit output; all other fit keywords pass through unchanged
    fit_kw['verbose'] = False

    # data dimensions
    n_neurons = data.n_neurons
    n_trials = data.n_trials
    if (n_trials != binned.shape[0]) or (n_neurons != binned.shape[-1]):
        raise ValueError('Dimension mismatch. Binned data and spike data do '
                         'not have the same number of neurons or trials.')

    # Allocate storage for held out spike times.
    trials, spiketimes, neurons = [], [], []

    # Determine neurons to hold out and fit.
    if transformed_neurons is None:
        transformed_neurons = range(n_neurons)

    # Set up progress bar.
    if progress_bar:
        transformed_neurons = tqdm(transformed_neurons)

    # Hold out each neuron, fit models, and apply transform to heldout cell.
    for n in transformed_neurons:

        # Define training set.
        trainset = list(set(range(n_neurons)) - {n})

        # Fit model.
        model.fit(binned[:, :, trainset], **fit_kw)

        # Apply warping to test set.
        w = model.transform(data.select_neurons([n]))

        # Store result.
        trials.extend(w.trials)
        spiketimes.extend(w.spiketimes)
        neurons.extend(np.full(len(w.trials), n).tolist())

    # Package result into a SpikeData instance.
    return SpikeData(trials, spiketimes, neurons, data.tmin, data.tmax)
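A minimal usage sketch for the function above, assuming the affinewarp-style ShiftWarping model and SpikeData container named in the docstring; the import path and constructor arguments are illustrative only.

#   from affinewarp import ShiftWarping            # assumed import path
#   model = ShiftWarping()                         # or an AffineWarping instance
#   aligned = heldout_transform(model, binned, data, progress_bar=True)
#   # `aligned` is a SpikeData instance; each unit was warped by a model fit
#   # to every *other* unit, so treat the time axis as per-neuron.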
Example #41
    first_half = make_inference(I0, middle, exp=exp - 1)
    second_half = make_inference(middle, I1, exp=exp - 1)
    return [*first_half, middle, *second_half]


if args.montage:
    left = w // 4
    w = w // 2
if args.UHD:
    ph = ((h - 1) // 64 + 1) * 64
    pw = ((w - 1) // 64 + 1) * 64
else:
    ph = ((h - 1) // 32 + 1) * 32
    pw = ((w - 1) // 32 + 1) * 32
padding = (0, pw - w, 0, ph - h)
pbar = tqdm(total=tot_frame)
skip_frame = 1
if args.montage:
    lastframe = lastframe[:, left:left + w]
write_buffer = Queue(maxsize=500)
read_buffer = Queue(maxsize=500)
_thread.start_new_thread(build_read_buffer, (args, read_buffer, videogen))
_thread.start_new_thread(clear_write_buffer, (args, write_buffer))

I1 = torch.from_numpy(np.transpose(lastframe, (2, 0, 1))).to(
    device, non_blocking=True).unsqueeze(0).float() / 255.
I1 = F.pad(I1, padding)
while True:
    frame = read_buffer.get()
    if frame is None:
        break
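The ph/pw computation above rounds the frame size up to the nearest multiple of 32 (or 64 in UHD mode), presumably to match the network's downsampling strides, and the extra pixels are padded onto the right and bottom edges only. A quick worked check of that arithmetic with illustrative dimensions (not values from the script):

h, w = 720, 1280
ph = ((h - 1) // 32 + 1) * 32   # 736: next multiple of 32 above 720
pw = ((w - 1) // 32 + 1) * 32   # 1280: already a multiple of 32
print((0, pw - w, 0, ph - h))   # padding -> (0, 0, 0, 16): right/bottom only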
Example #42
def train_stego(*, stegoanalyser: nn.Module,
                train_iterator: DataBatchIterator,
                val_iterator: DataBatchIterator,
                text_iterator: Iterator,
                n_epoch: int, stegoanalyser_opt: Optimizer,
                callbacks: Sequence[Callable] = None, logger: TBLogger,
                encoder: SigmoidTorchEncoder):
    criterion = F.binary_cross_entropy_with_logits
    callbacks = callbacks or []

    for epoch in tqdm(range(n_epoch)):
        stegoanalyser_losses = []
        with train_iterator as iterator:
            for real_batch, _ in iterator:
                batch_size = len(real_batch)
                labels = np.random.choice([0, 1], (batch_size, 1, 1, 1))
                encoded_images = []
                for image, label in zip(real_batch, labels):
                    if label == 1:
                        msg = bytes_to_bits(next(text_iterator))
                        key = generate_random_key(image.shape[1:], len(msg))
                        image = encoder.encode(transform_encoder(image), msg, key)
                        image = inverse_transform_encoder(image)
                    encoded_images.append(image)

                encoded_images = torch.stack(encoded_images)
                labels = torch.from_numpy(labels).float()
                # train stegoanalyzer
                stegoanalyser_opt.zero_grad()
                stegoanalyser_losses.append(
                    process_batch(encoded_images.detach(), labels, stegoanalyser, criterion))
                stegoanalyser_opt.step()

        with val_iterator as iterator:
            accuracy = []
            for real_batch, _ in iterator:
                batch_size = len(real_batch)

                labels = np.random.choice([0, 1], batch_size)
                encoded_images = []
                for image, label in zip(real_batch, labels):
                    if label == 1:
                        msg = bytes_to_bits(next(text_iterator))
                        key = generate_random_key(image.shape[1:], len(msg))
                        image = encoder.encode(transform_encoder(image), msg, key)
                        image = inverse_transform_encoder(image)
                    encoded_images.append(image)

                encoded_images = torch.stack(encoded_images)
                # evaluate stegoanalyzer
                out = inference_step(encoded_images, stegoanalyser).cpu().detach()
                out = torch.sigmoid(out) > 0.5
                out = out.reshape(len(encoded_images)).numpy()
                accuracy_score = sklearn.metrics.accuracy_score(labels, out)
                accuracy.append(accuracy_score)

            mean_accuracy = np.mean(accuracy)
            print(f'validation accuracy score {mean_accuracy}')

            losses = {'Stegoanalyser loss': np.mean(stegoanalyser_losses),
                      'Val accuracy': mean_accuracy}
            logger.policies(losses, epoch)

            # run callbacks
            for callback in callbacks:
                callback(epoch)
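train_stego embeds a bit-level message into roughly half of each batch via bytes_to_bits and a random key. A minimal sketch of a bytes_to_bits helper consistent with that usage; the project's own helper may differ.

import numpy as np

def bytes_to_bits(data: bytes) -> np.ndarray:
    """Unpack a byte string into a flat array of 0/1 bits (MSB first)."""
    return np.unpackbits(np.frombuffer(data, dtype=np.uint8))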
Example #43
from model_architecture_v_6 import *

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_data = []
val_data = []
test_data = []

with h5py.File('../split_aug_v4.h5', 'r') as gData:

    images = np.array(gData['train']['images'])
    labels = np.array(gData['train']['labels'])
    for i in tqdm(range(len(images))):
        train_data.append(tuple([transform(images[i]), labels[i]]))

    images = np.array(gData['val']['images'])
    labels = np.array(gData['val']['labels'])
    for i in tqdm(range(len(images))):
        val_data.append(tuple([transform(images[i]), labels[i]]))

    images = np.array(gData['test']['images'])
    labels = np.array(gData['test']['labels'])
    for i in tqdm(range(len(images))):
        test_data.append(tuple([transform(images[i]), labels[i]]))

torch.manual_seed(300)
random.shuffle(train_data)
random.shuffle(val_data)
Example #44
#rating for method1
r1 = Rating()
#rating for method2
r2 = Rating()
r1_result = np.zeros((ngames, 3))
r2_result = np.zeros((ngames, 3))
start1 = time.time()
boardSize = 3
#two parameters for MCTS
cp = 1
N = 100
#time control for idtt
time_limit = 5
idtt_depth = 5
#start playing games
for i in tqdm.tqdm(list(range(ngames))):
    #3 experiments using alphabeta with random or dijkstra evaluation: method can be 'random' or 'dijkstra'
    #uncomment the following code to run alphabeta VS alphabeta: random vs dijkstra, or dijkstra vs dijkstra
    # if i%2==0:
    #     #method1 moves first
    #    result=alphabeta_randomVSdijkstra(method1='dijkstra',method2='dijkstra',depth1=3,depth2=4,size=boardSize,print_all=False,first=True)
    # else:
    # #method2 moves first
    #    result=alphabeta_randomVSdijkstra(method1='dijkstra',method2='dijkstra',depth1=3,depth2=4,size=boardSize,print_all=False,first=False)

    #----------------------------------------------------------------------------------------------------------------------------------------------
    # experiments using idtt and alphabeta
    #uncomment to run idtt VS alphabeta (method can be 'random' or 'dijkstra')
    # if i%2==0:
    #     #method1 moves first
    #    result=idtt_alphabeta(method='dijkstra',idtt_depth=idtt_depth,depth2=3,size=boardSize,print_all=False,first=True,time_limit=time_limit)
Example #45
    def valid(self, dataloader, writer, ep):
        torch.cuda.empty_cache()
        self.model.eval()

        epoch_logs = {
            "gen_latent_loss": [],
            "gen_ref_loss": [],
            "disc_latent_loss": [],
            "disc_ref_loss": [],
            "style_latent_loss": [],
            "style_ref_loss": [],
            "diversity_latent_loss": [],
            "diversity_ref_loss": [],
            "cycle_latent_loss": [],
            "cycle_ref_loss": []
        }

        for indx, data in tqdm(enumerate(dataloader)):

            img, og_domain, x1, x2, domain = data
            img = img.to(self.device)
            x1 = x1.to(self.device)
            x2 = x2.to(self.device)
            z1 = torch.normal(
                torch.tensor([0.5]).repeat(self.batch, self.latent_dim),
                1).to(self.device)
            z2 = torch.normal(
                torch.tensor([0.5]).repeat(self.batch, self.latent_dim),
                1).to(self.device)

            disc_loss, gen_loss, style_loss, diversity_loss, cycle_loss = self.model(
                img, domain, og_domain, z=(z1, z2), train=False)

            disc_loss2, gen_loss2, style_loss2, diversity_loss2, cycle_loss2 = self.model(
                img, domain, og_domain, x=(x1, x2), train=False)

            epoch_logs["gen_latent_loss"].append((gen_loss).item())
            epoch_logs["gen_ref_loss"].append((gen_loss2).item())
            epoch_logs["disc_latent_loss"].append((disc_loss).item())
            epoch_logs["disc_ref_loss"].append((disc_loss2).item())
            epoch_logs["style_latent_loss"].append((style_loss).item())
            epoch_logs["style_ref_loss"].append((style_loss2).item())
            epoch_logs["diversity_latent_loss"].append((diversity_loss).item())
            epoch_logs["diversity_ref_loss"].append((diversity_loss2).item())
            epoch_logs["cycle_latent_loss"].append((cycle_loss).item())
            epoch_logs["cycle_ref_loss"].append((cycle_loss2).item())

            for x, y in epoch_logs.items():
                writer.add_scalar(f"val/{x}", np.array(y[-1:]), ep + indx)

        epoch_logs["gen_latent_loss"] = np.mean(epoch_logs["gen_latent_loss"])
        epoch_logs["gen_ref_loss"] = np.mean(epoch_logs["gen_ref_loss"])
        epoch_logs["disc_latent_loss"] = np.mean(
            epoch_logs["disc_latent_loss"])
        epoch_logs["disc_ref_loss"] = np.mean(epoch_logs["disc_ref_loss"])
        epoch_logs["style_latent_loss"] = np.mean(
            epoch_logs["style_latent_loss"])
        epoch_logs["style_ref_loss"] = np.mean(epoch_logs["style_ref_loss"])
        epoch_logs["diversity_latent_loss"] = np.mean(
            epoch_logs["diversity_latent_loss"])
        epoch_logs["diversity_ref_loss"] = np.mean(
            epoch_logs["diversity_ref_loss"])
        epoch_logs["cycle_latent_loss"] = np.mean(
            epoch_logs["cycle_latent_loss"])
        epoch_logs["cycle_ref_loss"] = np.mean(epoch_logs["cycle_ref_loss"])

        return epoch_logs
Example #46
import json
import urllib.request

import pandas as pd
from tqdm import tqdm

df = pd.read_excel('mapping_parsed_final_test_tableau.xlsx')

#df = df.drop(['AUM'], axis =1)
#print(df.head())
with open('apikey.txt', 'r') as f:
    api = f.read().strip()  # strip trailing whitespace so the key slots cleanly into the URL

lat = []
long = []

print('Geocoding locations..................................')
for row in tqdm(range(0, df.shape[0])):
    response = urllib.request.urlopen(
        'https://maps.googleapis.com/maps/api/geocode/json?address=' +
        df.iloc[row]['Area'].replace(' ', '+') + ',+' +
        df.iloc[row]['District'].replace(' ', '+') + ',' +
        df.iloc[row]['Region'].replace(' ', '+') + ',+' + 'Hong+Kong' +
        '&key=' + api)
    geocode_result = json.load(response)
    x = geocode_result['results'][0]['geometry']['location']['lat']
    y = geocode_result['results'][0]['geometry']['location']['lng']
    lat.append(x)
    long.append(y)

print('All coordinates successfully generated via geocoding')

lat = pd.Series(lat)
Example #47
    def run_experiment(self):
        """
        Runs experiment train and evaluation iterations, saving the model and best val model and val model accuracy after each epoch
        :return: The summary current_epoch_losses from starting epoch to total_epochs.
        """
        total_losses = {
            "train_acc": [],
            "train_loss": [],
            "val_acc": [],
            "val_loss": []
        }  # initialize a dict to keep the per-epoch metrics
        for i, epoch_idx in enumerate(
                range(self.starting_epoch, self.num_epochs)):
            epoch_start_time = time.time()
            current_epoch_losses = {
                "train_acc": [],
                "train_loss": [],
                "val_acc": [],
                "val_loss": []
            }
            self.current_epoch = epoch_idx
            with tqdm.tqdm(
                    total=len(self.train_data)
            ) as pbar_train:  # create a progress bar for training
                for idx, (x,
                          y) in enumerate(self.train_data):  # get data batches
                    loss, accuracy = self.run_train_iter(
                        x=x, y=y)  # take a training iter step
                    current_epoch_losses["train_loss"].append(
                        loss)  # add current iter loss to the train loss list
                    current_epoch_losses["train_acc"].append(
                        accuracy)  # add current iter acc to the train acc list
                    pbar_train.update(1)
                    pbar_train.set_description(
                        "loss: {:.4f}, accuracy: {:.4f}".format(
                            loss, accuracy))

            with tqdm.tqdm(
                    total=len(self.val_data)
            ) as pbar_val:  # create a progress bar for validation
                for x, y in self.val_data:  # get data batches
                    loss, accuracy = self.run_evaluation_iter(
                        x=x, y=y)  # run a validation iter
                    current_epoch_losses["val_loss"].append(
                        loss)  # add current iter loss to val loss list.
                    current_epoch_losses["val_acc"].append(
                        accuracy)  # add current iter acc to val acc lst.
                    pbar_val.update(1)  # add 1 step to the progress bar
                    pbar_val.set_description(
                        "loss: {:.4f}, accuracy: {:.4f}".format(
                            loss, accuracy))
            val_mean_accuracy = np.mean(current_epoch_losses['val_acc'])
            if val_mean_accuracy > self.best_val_model_acc:  # if current epoch's mean val acc is greater than the saved best val acc then
                self.best_val_model_acc = val_mean_accuracy  # set the best val model acc to be current epoch's val accuracy
                self.best_val_model_idx = epoch_idx  # set the experiment-wise best val idx to be the current epoch's idx

            for key, value in current_epoch_losses.items():
                total_losses[key].append(
                    np.mean(value)
                )  # get mean of all metrics of current epoch metrics dict, to get them ready for storage and output on the terminal.

            save_statistics(experiment_log_dir=self.experiment_logs,
                            filename='summary.csv',
                            stats_dict=total_losses,
                            current_epoch=i,
                            continue_from_mode=True if
                            (self.starting_epoch != 0 or i > 0) else
                            False)  # save statistics to stats file.

            # load_statistics(experiment_log_dir=self.experiment_logs, filename='summary.csv') # How to load a csv file if you need to

            out_string = "_".join([
                "{}_{:.4f}".format(key, np.mean(value))
                for key, value in current_epoch_losses.items()
            ])
            # create a string to use to report our epoch metrics
            epoch_elapsed_time = time.time(
            ) - epoch_start_time  # calculate time taken for epoch
            epoch_elapsed_time = "{:.4f}".format(epoch_elapsed_time)
            print("Epoch {}:".format(epoch_idx), out_string, "epoch time",
                  epoch_elapsed_time, "seconds")
            self.state['model_epoch'] = epoch_idx
            self.save_model(
                model_save_dir=self.experiment_saved_models,
                # save model and best val idx and best val acc, using the model dir, model name and model idx
                model_save_name="train_model",
                model_idx=epoch_idx,
                best_validation_model_idx=self.best_val_model_idx,
                best_validation_model_acc=self.best_val_model_acc)
            self.save_model(
                model_save_dir=self.experiment_saved_models,
                # save model and best val idx and best val acc, using the model dir, model name and model idx
                model_save_name="train_model",
                model_idx='latest',
                best_validation_model_idx=self.best_val_model_idx,
                best_validation_model_acc=self.best_val_model_acc)

            ################################################################
            ##### Plot Gradient Flow at each Epoch during Training  ######
            print(
                "Generating Gradient Flow Plot at epoch {}".format(epoch_idx))
            plt = self.plot_grad_flow(self.model.named_parameters())
            if not os.path.exists(
                    os.path.join(self.experiment_saved_models,
                                 'gradient_flow_plots')):
                os.mkdir(
                    os.path.join(self.experiment_saved_models,
                                 'gradient_flow_plots'))
                # plt.legend(loc="best")
            plt.savefig(
                os.path.join(self.experiment_saved_models,
                             'gradient_flow_plots',
                             "epoch{}.pdf".format(str(epoch_idx))))
            ################################################################

        print("Generating test set evaluation metrics")
        self.load_model(
            model_save_dir=self.experiment_saved_models,
            model_idx=self.best_val_model_idx,
            # load best validation model
            model_save_name="train_model")
        current_epoch_losses = {
            "test_acc": [],
            "test_loss": []
        }  # initialize a statistics dict
        with tqdm.tqdm(
                total=len(self.test_data)) as pbar_test:  # init a progress bar
            for x, y in self.test_data:  # sample batch
                loss, accuracy = self.run_evaluation_iter(
                    x=x, y=y
                )  # compute loss and accuracy by running an evaluation step
                current_epoch_losses["test_loss"].append(
                    loss)  # save test loss
                current_epoch_losses["test_acc"].append(
                    accuracy)  # save test accuracy
                pbar_test.update(1)  # update progress bar status
                pbar_test.set_description(
                    "loss: {:.4f}, accuracy: {:.4f}".format(
                        loss, accuracy))  # update progress bar string output

        test_losses = {
            key: [np.mean(value)]
            for key, value in current_epoch_losses.items()
        }  # save test set metrics in dict format
        save_statistics(
            experiment_log_dir=self.experiment_logs,
            filename='test_summary.csv',
            # save test set metrics on disk in .csv format
            stats_dict=test_losses,
            current_epoch=0,
            continue_from_mode=False)

        return total_losses, test_losses
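run_experiment persists its metrics through save_statistics; a minimal sketch consistent with how it is called here (header on the first write, one CSV row per epoch), purely illustrative of the expected behaviour rather than the framework's actual helper:

import csv
import os

def save_statistics(experiment_log_dir, filename, stats_dict,
                    current_epoch, continue_from_mode=False):
    path = os.path.join(experiment_log_dir, filename)
    with open(path, 'a' if continue_from_mode else 'w', newline='') as f:
        writer = csv.writer(f)
        if not continue_from_mode:
            writer.writerow(list(stats_dict.keys()))
        writer.writerow([stats_dict[key][current_epoch] for key in stats_dict])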
Example #48
            else:
                print("wrong path or files")
    return picture

wsi_set = get_picture(wsi_path)

csvfile = open('normal_bounding_boxes_in_tumor_wsi.csv', 'w')
fieldnames = ['wsi', 'bounding_boxes', 'patch_index']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

wsioptions = WSIOps()

total_index = 0

for i in tqdm(range(len(wsi_set))):
#for i in tqdm(range(len(wsi_set))):
    wsi_mask, mask_image = wsioptions.read_wsi_mask(base_path + mask_path + (os.path.splitext(wsi_set[i]))[0] + "_Mask.tif")

    wsi_image, rgb_image, _, _, _ = wsioptions.read_wsi_tumor(base_path + wsi_path + wsi_set[i], base_path + mask_path + (os.path.splitext(wsi_set[i]))[0] + "_Mask.tif")

    bounding_boxes, rgb_contour, image_open = wsioptions.find_roi_bbox(rgb_image)
    #bounding_boxes = wsioptions.find_roi_bbox_tumor_gt_mask(mask_image)
    print('%s boundary boxes:' % os.path.splitext(wsi_set[i])[0], bounding_boxes)
#    writer.writerow({'wsi':os.path.splitext(wsi_set[i])[0], 'bounding_boxes':bounding_boxes})
#    print('saved successfully!!')
    level_used = wsi_mask.level_count - 1
    patchex = PatchExtractor()
    patch_index = patchex.extract_negative_patches_from_tumor_wsi(wsi_image, mask_image, image_open, level_used, bounding_boxes, patch_save_dir='normal_patches/', patch_prefix=(os.path.splitext(wsi_set[i]))[0] + '_', patch_index=0)
    print('last numbers:', total_index)
    print('new generate patches:', patch_index)
Example #49
def simple_use_plan_06(kwargs) -> AllocationPlan:
    atcs = kwargs["atcs"]
    devices = kwargs["ds"]
    max_hop = kwargs["max_hop"]
    congestion_scope = kwargs["congestion_scope"]

    t_len = len(atcs)
    y_len = len(atcs[0])
    x_len = len(atcs[0][0])
    allocation_plan = create_blank_allocation_plan(atcs, devices)
    for time, cloudlets in enumerate(tqdm(atcs)):
        ds = list(filter(lambda d: d.is_poweron(time), devices))
        reqapp = ["1","2","3"]
        # split the devices by requested app
        dsa1 = list(filter(lambda d: d.appret(reqapp[0]), ds))
        dsa2 = list(filter(lambda d: d.appret(reqapp[1]), ds))
        dsa3 = list(filter(lambda d: d.appret(reqapp[2]), ds))
        # determine the apps' order by request count (largest first)
        ds_app_len = [
            len(dsa1),
            len(dsa2),
            len(dsa3)
        ]
        max_len = [0,0,0]
        for i in range(0,3):
            max_len[i] = str(np.argmax(ds_app_len))
            ds_app_len[int(max_len[i])] = 0

        # measure congestion for each app
        congestion_mapa1 = simple_create_congestion_map(time, x_len, y_len, dsa1, congestion_scope)
        congestion_mapa2 = simple_create_congestion_map(time, x_len, y_len, dsa2, congestion_scope)
        congestion_mapa3 = simple_create_congestion_map(time, x_len, y_len, dsa3, congestion_scope)
        # set priorities from each app's congestion and the available cloudlets nearby
        set_app_pri(atcs, congestion_mapa1, time, dsa1, x_len, y_len, max_len, congestion_scope)
        set_app_pri(atcs, congestion_mapa2, time, dsa2, x_len, y_len, max_len, congestion_scope)
        set_app_pri(atcs, congestion_mapa3, time, dsa3, x_len, y_len, max_len, congestion_scope)

        ds = sorted(ds, key=lambda d: d.ds_pri, reverse=False)

        for d in ds:
            # step 1: decide whether keeping the previous location is appropriate and, if so, try that placement
            now_pos = d.get_pos(time)
            if d.startup_time != time:
                prev_pos = d.get_allocation_point(time - 1)
                if distance(prev_pos, now_pos) <= max_hop:
                    # try to place it at the same location as last time
                    if cloudlets[prev_pos.y][prev_pos.x].can_append_device(d, True):
                        allocate(d, time, prev_pos, allocation_plan, cloudlets)
                        continue
            # step2
            if d.is_poweron(time + max_hop):
                next_pos = d.get_pos(time + max_hop)
            else:
                next_pos = d.get_pos(d.shutdown_time - 1)
            for hop in range(max_hop, 30):
                nps = near_points(now_pos, hop, Point(x_len - 1, y_len - 1), Point(0, 0))
                nps = sorted(nps, key=lambda p: distance(p, next_pos))
                tp, index = search(nps, True, key=lambda p: cloudlets[p.y][p.x].can_append_device(d, True))
                if index == -1:
                    continue
                allocate(d, time, tp, allocation_plan, cloudlets)
                break
            else:
                # case where the device could not be allocated anywhere
                allocation = Allocation(now_pos.x, now_pos.y, -1)
                allocation_plan[d.name][time] = allocation
                d.set_allocation_point(time, allocation)
                print("allocation failed", d.name, time)
    return allocation_plan
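simple_use_plan_06 depends on a search helper that returns the first element whose key matches the target together with its index, or index -1 when nothing matches. A minimal sketch consistent with that usage; the project's own version may differ.

def search(items, target, key):
    """Return (item, index) of the first item with key(item) == target, else (None, -1)."""
    for i, item in enumerate(items):
        if key(item) == target:
            return item, i
    return None, -1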
Example #50
    def train(self, dataloader, writer, ep):
        torch.cuda.empty_cache()
        self.model.train()
        epoch_logs = {
            "gen_latent_loss": [],
            "gen_ref_loss": [],
            "disc_latent_loss": [],
            "disc_ref_loss": [],
            "style_latent_loss": [],
            "style_ref_loss": [],
            "diversity_latent_loss": [],
            "diversity_ref_loss": [],
            "cycle_latent_loss": [],
            "cycle_ref_loss": []
        }

        for indx, data in tqdm(enumerate(dataloader)):

            img, og_domain, x1, x2, domain = data
            img = img.to(self.device)
            x1 = x1
            x2 = x2
            z1 = torch.normal(
                torch.tensor([0.5]).repeat(self.batch, self.latent_dim), 1)
            z2 = torch.normal(
                torch.tensor([0.5]).repeat(self.batch, self.latent_dim), 1)

            disc_loss, gen_loss, style_loss, diversity_loss, cycle_loss = self.model(
                img, domain, og_domain, z=(z1, z2))

            self.model.dsc_optim.zero_grad()
            disc_loss.backward(retain_graph=True)
            self.model.map_optim.zero_grad()
            self.model.style_optim.zero_grad()
            self.model.gen_optim.zero_grad()
            gen_loss.backward()
            self.model.dsc_optim.step()
            self.model.gen_optim.step()
            self.model.map_optim.step()

            self.model.style_optim.step()

            disc_loss2, gen_loss2, style_loss2, diversity_loss2, cycle_loss2 = self.model(
                img, domain, og_domain, x=(x1, x2))

            self.model.dsc_optim.zero_grad()
            disc_loss2.backward(retain_graph=True)
            self.model.gen_optim.zero_grad()
            gen_loss2.backward()
            self.model.dsc_optim.step()

            self.model.gen_optim.step()

            epoch_logs["gen_latent_loss"].append((gen_loss).item())
            epoch_logs["gen_ref_loss"].append((gen_loss2).item())
            epoch_logs["disc_latent_loss"].append((disc_loss).item())
            epoch_logs["disc_ref_loss"].append((disc_loss2).item())
            epoch_logs["style_latent_loss"].append((style_loss).item())
            epoch_logs["style_ref_loss"].append((style_loss2).item())
            epoch_logs["diversity_latent_loss"].append((diversity_loss).item())
            epoch_logs["diversity_ref_loss"].append((diversity_loss2).item())
            epoch_logs["cycle_latent_loss"].append((cycle_loss).item())
            epoch_logs["cycle_ref_loss"].append((cycle_loss2).item())

            for x, y in epoch_logs.items():

                writer.add_scalar(f"train/{x}", np.array(y[-1:]), ep + indx)

        epoch_logs["gen_latent_loss"] = np.mean(epoch_logs["gen_latent_loss"])
        epoch_logs["gen_ref_loss"] = np.mean(epoch_logs["gen_ref_loss"])
        epoch_logs["disc_latent_loss"] = np.mean(
            epoch_logs["disc_latent_loss"])
        epoch_logs["disc_ref_loss"] = np.mean(epoch_logs["disc_ref_loss"])
        epoch_logs["style_latent_loss"] = np.mean(
            epoch_logs["style_latent_loss"])
        epoch_logs["style_ref_loss"] = np.mean(epoch_logs["style_ref_loss"])
        epoch_logs["diversity_latent_loss"] = np.mean(
            epoch_logs["diversity_latent_loss"])
        epoch_logs["diversity_ref_loss"] = np.mean(
            epoch_logs["diversity_ref_loss"])
        epoch_logs["cycle_latent_loss"] = np.mean(
            epoch_logs["cycle_latent_loss"])
        epoch_logs["cycle_ref_loss"] = np.mean(epoch_logs["cycle_ref_loss"])

        return epoch_logs
Example #51
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    
    parser.add_argument('--text_a', type=str, default='', help="input text_a.")
    parser.add_argument('--text_b', type=str, default='', help="input text_b.")
    
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
              cache_dir=cache_dir,
              num_labels = num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        #model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
        # Load a trained model and config that you have fine-tuned
        print('for eval only......................')
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        #eval_examples = processor.get_dev_examples(args.data_dir)
        #eval_examples = {'text_a':"He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .",'text_b':"The foodservice pie business does not fit our long-term growth strategy .",'label':'1','guid':'12345'}
        eval_examples = {'text_a':args.text_a,'text_b':args.text_b,'label':'1','guid':'1234'}
        print(eval_examples)
        #import pdb;pdb.set_trace()
        eval_features = convert_examples_to_features_pred(eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
 
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            # convert logits to probability
            logits_prob = np.exp(logits)/(1+np.exp(logits))
            print("================================")
            print("label is : {}".format(np.argmax(logits,axis=1)))
            print("confidence score : {}".format(np.max(logits_prob,axis=1)))
            print("================================")
            sys.exit(1)  # stop after printing the first prediction; the metrics below are never reached
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
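
The loop above turns logits into a "confidence score" with an element-wise sigmoid, which does not sum to 1 across classes. A minimal sketch, assuming a (batch, num_labels) NumPy array of logits, of the softmax conversion that yields a normalized distribution (all values below are illustrative):

import numpy as np

def softmax(logits, axis=-1):
    # subtract the row-wise max for numerical stability before exponentiating
    z = logits - logits.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

logits = np.array([[1.2, -0.3], [0.1, 2.4]])  # hypothetical (batch, num_labels) scores
probs = softmax(logits)                       # each row sums to 1
pred = probs.argmax(axis=1)                   # predicted label per example
conf = probs.max(axis=1)                      # confidence of that prediction
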
def main():
    cut_off = 0
    parser = ArgumentParser()
    parser.add_argument(
        "--scrape-file-loc",
        help="location of scrape file to be sentence splitted",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--output-folder", help="location of output folder", type=str, required=True
    )
    parser.add_argument(
        "--lang",
        help="language code :\
                                        'Kannada':'kn',\
                                        'Tamil':'ta',\
                                        'Marathi':'mr',\
                                        'Telugu':'te',\
                                        'Bengali':'bn',\
                                        'Gujarati':'gu',\
                                        'Malayalam':'ml',\
                                        'Punjabi':'pa',\
                                        'Assamese':'asm',\
                                        'Odia':'or',\
                                        'Urdu':'ur'",
        type=str,
        required=True,
    )
    args = parser.parse_args()
    lang = args.lang
    look_up_dict = {
        "English": "en",
        "Hindi": "hi",
        "Kannada": "kn",
        "Tamil": "ta",
        "Marathi": "mr",
        "Telugu": "te",
        "Bengali": "bn",
        "Gujarati": "gu",
        "Malayalam": "ml",
        "Punjabi": "pa",
        "Assamese": "asm",
        "Odia": "or",
        "Urdu": "ur",
    }
    # Note: invert the dictionary above so language codes map back to language names
    look_up_dict = {v: k for k, v in look_up_dict.items()}
    if lang in look_up_dict.keys():
        scrape_loc = args.scrape_file_loc
        csv_file_loc = args.output_folder
        # list_fl = '_'.join([look_up_dict[lang],n_month,n_year]) + '.csv'
        tokenize_loc = os.path.join(
            csv_file_loc, "tokenize_file_" + os.path.basename(os.path.normpath(scrape_loc))
        )
        # submit_aligner = csv_file_loc + '\\' + 'submit_aligner'
        if not os.path.exists(scrape_loc):
            print(f"Path dosent exists:{scrape_loc}")
            return
        create_directory(csv_file_loc)
        create_directory(tokenize_loc)

    else:
        print("Please enter the corrent langauge code")
        return
    total_sen_pd = pd.DataFrame(columns=[look_up_dict[lang] + "_sen"])
    fl_list = sorted(glob.glob(os.path.join(scrape_loc, "*.txt")))
    fl_list_rename = [
        os.path.join(
            scrape_loc,
            "_".join(
                [
                    os.path.basename(i).split(".")[0].split()[0].zfill(5),
                    *os.path.basename(i).split(".")[0].split()[1:6],
                ]
            ),
        )
        + "."
        + os.path.basename(i).split(".")[-1]
        for i in fl_list
    ]
    for org, chg in zip(fl_list, fl_list_rename):
        os.rename(org, chg)
    fl_list = sorted(glob.glob(os.path.join(scrape_loc, "*.txt")))
    old_count = 0
    for k, fl in tqdm(enumerate(fl_list), total = len(fl_list)):
        if k < cut_off: continue
        # print(os.path.basename(fl))
        # Read Scrape Content
        tok_flname = os.path.join(tokenize_loc, "tok_" + os.path.basename(fl))
        with open(fl, mode="r", encoding="utf-16") as file_r:
            content = file_r.read()
        #             print(content)
        # Cleaning scrape content
        paragraph = content.split("\n")
        content = []
        for para in paragraph:
            para = para.strip()
            para = " ".join(para.split())
            if len(para.split()) >= 4:
                if lang == "en":
                    content.append(para)
                else:
                    try:
                        if detect(para) != "en":
                            content.append(para)
                    except:
                        content.append(para)
        # Tokenizing paragraphs into sentences
        sentences = []
        if lang != 'en':
            for entry in content:
                sentences.extend(sentence_split(entry, lang))
        else:
            for entry in content:
                sentences.extend(sent_tokenize(entry))
        # Removing Duplicates
        sentences = (
            pd.DataFrame(sentences, columns=["sen"])
            .drop_duplicates()
            .loc[:, "sen"]
            .values.tolist()
        )
        # Write sentence token
        with open(tok_flname, mode="w", encoding="utf-16") as file_w:
            for sen in sentences:
                sen = sen.strip()
                sen = sen.strip('"')
                if len(sen.split()) >= 4:
                    file_w.write(sen + "\n")
                    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat
                    total_sen_pd = pd.concat(
                        [total_sen_pd, pd.DataFrame([{look_up_dict[lang] + "_sen": sen.strip()}])],
                        ignore_index=True,
                    )
        # print(f'Number of sentences found: {total_sen_pd.shape[0]-old_count}')
        old_count = total_sen_pd.shape[0]

    print(f"Total number of sentences found: {total_sen_pd.shape[0]}")
    total_sen_pd.drop_duplicates(inplace=True)
    print(
        f"Total number of sentences after removing duplicate: {total_sen_pd.shape[0]}"
    )
    sys.stdout.flush()
    out_base = "total_" + lang + "_sen_" + os.path.basename(os.path.normpath(scrape_loc))
    total_sen_pd.to_csv(
        os.path.join(csv_file_loc, out_base + ".csv"),
        index=False,
        encoding="utf-16",
    )
    with open(
        os.path.join(csv_file_loc, out_base + ".txt"),
        mode="w",
        encoding="utf-16",
    ) as write_total:
        for line in total_sen_pd[look_up_dict[lang] + "_sen"].values.tolist():
            write_total.write(line.strip() + "\n")
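
Growing `total_sen_pd` one row at a time is quadratic, and `DataFrame.append` is gone from pandas 2.x. A minimal sketch of the usual alternative, collecting rows in a list and building the frame once; the `kn_sen` column name and the sample sentences are only placeholders:

import pandas as pd

rows = []
for sen in ["this sentence easily has four words", "too short"]:
    sen = sen.strip().strip('"')
    if len(sen.split()) >= 4:
        rows.append({"kn_sen": sen})  # stands in for look_up_dict[lang] + "_sen"

total_sen_pd = pd.DataFrame(rows, columns=["kn_sen"]).drop_duplicates()
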
Example #53
        template = cv2.imread(template_path, cv2.IMREAD_UNCHANGED).astype(
            np.float32) / 255.0
        template_mask = get_mask_from_image(template)
        category = os.path.splitext(template_name)[0]
        templates.append((template, template_mask, category))

    MAX_PROCESS = args.max_process
    processes = []
    load_path = args.random_data
    binary_annotation_lines_queue = mp.Queue()
    multiclass_annotation_lines_queue = mp.Queue()
    binary_annotation_lines = []
    multiclass_annotation_lines = []
    nb_imgs_generated = 0
    t0_temp = time.time()
    pbar = tqdm(total=args.total_images)
    for idx, img_name in enumerate(img_names):
        p = mp.Process(target=generate_sample, args=((
            targets_path, img_name, templates, probabilities_vector, positions_list, images_out_path, idx,
            binary_annotation_lines_queue, multiclass_annotation_lines_queue, data_out_path, load_path)))
        p.daemon = True
        p.start()
        processes.append(p)
        if len(processes) == MAX_PROCESS:
            for p in processes:
                p.join()
                pbar.update(1)
                binary_annotation_lines += binary_annotation_lines_queue.get()
                multiclass_annotation_lines += multiclass_annotation_lines_queue.get()
                nb_imgs_generated += 1
            processes = []  # reset the batch so later groups of MAX_PROCESS workers are also joined
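
The snippet above launches workers in batches of MAX_PROCESS, joins them, and drains the result queues. A minimal self-contained sketch of that batch-and-reset pattern with tqdm; the worker, payload, and counts are illustrative, and the queue is drained before joining so a worker blocked on a full pipe cannot stall the join:

import multiprocessing as mp
from tqdm import tqdm

def worker(i, q):
    q.put([f"line-{i}"])  # small illustrative payload

if __name__ == "__main__":
    max_procs, total = 4, 12
    results, batch = [], []
    q = mp.Queue()
    with tqdm(total=total) as pbar:
        for i in range(total):
            p = mp.Process(target=worker, args=(i, q))
            p.start()
            batch.append(p)
            if len(batch) == max_procs or i == total - 1:
                for p in batch:
                    results += q.get()  # drain first, then join
                    p.join()
                    pbar.update(1)
                batch = []  # reset so the next batch is collected as well
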
def simple_use_plan_02(kwargs) -> AllocationPlan:
    atcs = kwargs["atcs"]
    devices = kwargs["ds"]
    max_hop = kwargs["max_hop"]
    congestion_scope = kwargs["congestion_scope"]
    reqapp=["1", "2", "3"]
    t_len = len(atcs)
    y_len = len(atcs[0])
    x_len = len(atcs[0][0])
    # Create an empty allocation plan for the cloudlets
    allocation_plan = create_blank_allocation_plan(atcs, devices)
    for time, cloudlets in enumerate(tqdm(atcs)):
        # Collect the devices (ds) that are still powered on at this time step
        ds = list(filter(lambda d: d.is_poweron(time), devices))
        #ds = sorted(ds,lambda d: d.app_name,reversed=True)
        dsa1 = list(filter(lambda d: d.appret(reqapp[0]), ds))
        dsa2 = list(filter(lambda d: d.appret(reqapp[1]), ds))
        dsa3 = list(filter(lambda d: d.appret(reqapp[2]), ds))
        a1_len = len(dsa1)
        a2_len = len(dsa2)
        a3_len = len(dsa3)
        # Build a congestion map for each application
        congestion_mapa1 = simple_create_congestion_map(time, x_len, y_len, dsa1, congestion_scope)
        congestion_mapa2 = simple_create_congestion_map(time, x_len, y_len, dsa2, congestion_scope)
        congestion_mapa3 = simple_create_congestion_map(time, x_len, y_len, dsa3, congestion_scope)
        # # # # print_congestion(congestion_map, x_len, y_len)
        dsa1 = sorted(dsa1, key=lambda d: congestion_mapa1[d.get_pos(time).y][d.get_pos(time).x], reverse=True)
        dsa2 = sorted(dsa2, key=lambda d: congestion_mapa2[d.get_pos(time).y][d.get_pos(time).x], reverse=True)
        dsa3 = sorted(dsa3, key=lambda d: congestion_mapa3[d.get_pos(time).y][d.get_pos(time).x], reverse=True)
        # if(a1_len > a2_len):
        #     if(a1_len > a3_len):
        #         if(a2_len > a3_len):
        #             ds = dsa2 + dsa1 + dsa3
        #         else:
        #             ds = dsa3 + dsa1 + dsa2
        #     else:
        #         ds = dsa1 + dsa3 + dsa2
        # else:
        #     if(a1_len > a3_len):
        #         ds = dsa1 + dsa2 + dsa3
        #     else:
        #         if(a2_len > a3_len):
        #             ds = dsa2 + dsa3 + dsa1
        #         else:
        #             ds = dsa3 + dsa2 + dsa1

        #ds = dsa2 + dsa1 + dsa3
        ds = dsa1 + dsa3 + dsa2
        # congestion_map = simple_create_congestion_map(time, x_len, y_len, ds, congestion_scope)
        # ds_high = list(filter(lambda d: congestion_map[d.get_pos(time).y][d.get_pos(time).x] > 3), ds)
        # ds_low = list(filter(lambda d: congestion_map[d.get_pos(time).y][d.get_pos(time).x] < 3), ds)
        # ds = ds_high + ds_low

        for d in ds:
            # Step 1: if the previous placement is still within max_hop, try to keep it
            now_pos = d.get_pos(time)
            if d.startup_time != time:
                prev_pos = d.get_allocation_point(time - 1)
                if distance(prev_pos, now_pos) <= max_hop:
                    # Try to place the device at the same location as before
                    if cloudlets[prev_pos.y][prev_pos.x].can_append_device(d, True):
                        allocate(d, time, prev_pos, allocation_plan, cloudlets)
                        continue
            # Step 2: otherwise search outward from the current position for a cloudlet with capacity
            if d.is_poweron(time + max_hop):
                next_pos = d.get_pos(time + max_hop)
            else:
                next_pos = d.get_pos(d.shutdown_time - 1)
            for hop in range(max_hop, 30):
                nps = near_points(now_pos, hop, Point(x_len - 1, y_len - 1), Point(0, 0))
                nps = sorted(nps, key=lambda p: distance(p, next_pos))
                tp, index = search(nps, True, key=lambda p: cloudlets[p.y][p.x].can_append_device(d, True))
                if index == -1:
                    continue
                allocate(d, time, tp, allocation_plan, cloudlets)
                break
            else:
                # No cloudlet could accept the device at any hop
                allocation = Allocation(now_pos.x, now_pos.y, -1)
                allocation_plan[d.name][time] = allocation
                d.set_allocation_point(time, allocation)
                print("allocation failed", d.name, time)

    return allocation_plan
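
The hop search above leans on Python's `for ... else`: the `else` branch runs only when every hop was tried without a `break`, which is where the failed allocation is recorded. A tiny self-contained sketch of that idiom; the candidate list and the success test are illustrative:

candidates = [3, 5, 9]
for hop in candidates:
    if hop % 2 == 0:  # stand-in for "a cloudlet at this hop accepted the device"
        print("placed at hop", hop)
        break
else:
    # reached only because the loop finished without break: no candidate worked
    print("allocation failed")
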
Example #55
def kmean_anchors(path='./data/coco128.yaml',
                  n=9,
                  img_size=640,
                  thr=4.0,
                  gen=1000,
                  verbose=True):
    """ Creates kmeans-evolved anchors from training dataset

        Arguments:
            path: path to dataset *.yaml, or a loaded dataset
            n: number of anchors
            img_size: image size used for training
            thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0
            gen: generations to evolve anchors using genetic algorithm
            verbose: print all results

        Return:
            k: kmeans evolved anchors

        Usage:
            from utils.autoanchor import *; _ = kmean_anchors()
    """
    thr = 1. / thr
    prefix = colorstr('autoanchor: ')

    def metric(k, wh):  # compute metrics
        r = wh[:, None] / k[None]
        x = torch.min(r, 1. / r).min(2)[0]  # ratio metric
        # x = wh_iou(wh, torch.tensor(k))  # iou metric
        return x, x.max(1)[0]  # x, best_x

    def anchor_fitness(k):  # mutation fitness
        _, best = metric(torch.tensor(k, dtype=torch.float32), wh)
        return (best * (best > thr).float()).mean()  # fitness

    def print_results(k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        x, best = metric(k, wh0)
        bpr, aat = (best > thr).float().mean(), (
            x > thr).float().mean() * n  # best possible recall, anch > thr
        print(
            f'{prefix}thr={thr:.2f}: {bpr:.4f} best possible recall, {aat:.2f} anchors past thr'
        )
        print(
            f'{prefix}n={n}, img_size={img_size}, metric_all={x.mean():.3f}/{best.mean():.3f}-mean/best, '
            f'past_thr={x[x > thr].mean():.3f}-mean: ',
            end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])),
                  end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    if isinstance(path, str):  # *.yaml file
        with open(path) as f:
            data_dict = yaml.load(f, Loader=yaml.SafeLoader)  # model dict
        from utils.datasets import LoadImagesAndLabels
        dataset = LoadImagesAndLabels(data_dict['train'],
                                      augment=True,
                                      rect=True)
    else:
        dataset = path  # dataset

    # Get label wh
    shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    wh0 = np.concatenate(
        [l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])  # wh

    # Filter
    i = (wh0 < 3.0).any(1).sum()
    if i:
        print(
            f'{prefix}WARNING: Extremely small objects found. {i} of {len(wh0)} labels are < 3 pixels in size.'
        )
    wh = wh0[(wh0 >= 2.0).any(1)]  # keep boxes with at least one side >= 2 pixels
    # wh = wh * (np.random.rand(wh.shape[0], 1) * 0.9 + 0.1)  # multiply by random scale 0-1

    # Kmeans calculation
    print(f'{prefix}Running kmeans for {n} anchors on {len(wh)} points...')
    s = wh.std(0)  # sigmas for whitening
    k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
    k *= s
    wh = torch.tensor(wh, dtype=torch.float32)  # filtered
    wh0 = torch.tensor(wh0, dtype=torch.float32)  # unfiltered
    k = print_results(k)

    # Plot
    # k, d = [None] * 20, [None] * 20
    # for i in tqdm(range(1, 21)):
    #     k[i-1], d[i-1] = kmeans(wh / s, i)  # points, mean distance
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7), tight_layout=True)
    # ax = ax.ravel()
    # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))  # plot wh
    # ax[0].hist(wh[wh[:, 0]<100, 0],400)
    # ax[1].hist(wh[wh[:, 1]<100, 1],400)
    # fig.savefig('wh.png', dpi=200)

    # Evolve
    npr = np.random
    f, sh, mp, s = anchor_fitness(
        k), k.shape, 0.9, 0.1  # fitness, anchor shape, mutation probability, sigma
    pbar = tqdm(range(gen),
                desc=f'{prefix}Evolving anchors with Genetic Algorithm:'
                )  # progress bar
    for _ in pbar:
        v = np.ones(sh)
        while (v == 1
               ).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s +
                 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = anchor_fitness(kg)
        if fg > f:
            f, k = fg, kg.copy()
            pbar.desc = f'{prefix}Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'
            if verbose:
                print_results(k)

    return print_results(k)
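
The same whiten-by-sigma, k-means, then mutate-and-keep-improvements recipe can be reproduced with SciPy alone. A minimal sketch on synthetic width/height pairs; the data, threshold, and generation count are illustrative, not the YOLOv5 defaults:

import numpy as np
from scipy.cluster.vq import kmeans
from tqdm import tqdm

wh = np.random.rand(500, 2) * 100 + 2  # synthetic box widths/heights in pixels
s = wh.std(0)                          # per-dimension sigma used for whitening
k, _ = kmeans(wh / s, 9, iter=30)      # cluster in whitened space
k *= s                                 # un-whiten the centroids

def fitness(k, wh, thr=0.25):
    r = wh[:, None] / k[None]          # ratio of every box to every anchor
    x = np.minimum(r, 1 / r).min(2)    # worst-dimension ratio per (box, anchor)
    best = x.max(1)                    # best anchor for each box
    return (best * (best > thr)).mean()

f = fitness(k, wh)
for _ in tqdm(range(300), desc="evolving anchors"):
    kg = (k * np.random.normal(1.0, 0.1, k.shape)).clip(min=2.0)  # mutate
    fg = fitness(kg, wh)
    if fg > f:                         # keep the mutation only if fitness improves
        f, k = fg, kg
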
Example #56
hdf5_path = "/home/mil/gupta/ifood18/data/h5data/test_data.h5py"
# open a hdf5 file and create earrays
hdf5_file = h5py.File(hdf5_path, mode='w')

hdf5_file.create_dataset("data", test_shape, np.float32)
hdf5_file.create_dataset("mean", test_shape[1:], np.float32)
#hdf5_file.create_dataset("labels", (len(test_data),), np.int32)
#hdf5_file["labels"][...] = val_labels



from tqdm import tqdm
# a numpy array to save the mean of the images
mean = np.zeros(test_shape[1:], np.float32)
# loop over train addresses
for i in tqdm(range(len(test_data))):			   
	addr = os.path.join(test_data_path,test_data[i])
	#print("image addres is :",addr)
	img = cv2.imread(addr)
	#print(img)
	img = cv2.resize(img, (image_size, image_size), interpolation=cv2.INTER_CUBIC)
	img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
	


	#current_image = ia.imresize_single_image(addr, (256, 256))
	image_aug = img.transpose(2,0,1)
	hdf5_file["data"][i, ...] = image_aug[None]
	mean += image_aug / float(len(test_data))
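
The HDF5 snippet above pre-allocates datasets, fills them image by image, and accumulates a running mean. A minimal self-contained sketch of that pattern with synthetic images in place of cv2.imread; the file name, shapes, and counts are illustrative:

import h5py
import numpy as np
from tqdm import tqdm

n, size = 10, 64
shape = (n, 3, size, size)  # channels-first layout, as in the example above
with h5py.File("demo_data.h5", mode="w") as f:
    f.create_dataset("data", shape, np.float32)
    f.create_dataset("mean", shape[1:], np.float32)
    mean = np.zeros(shape[1:], np.float32)
    for i in tqdm(range(n)):
        img = np.random.rand(size, size, 3).astype(np.float32)  # stand-in for a loaded, resized image
        img = img.transpose(2, 0, 1)                            # HWC -> CHW
        f["data"][i, ...] = img
        mean += img / n
    f["mean"][...] = mean
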

	
def undistort_points(config, dataframe, camera_pair, destfolder):
    cfg_3d = auxiliaryfunctions.read_config(config)
    img_path, path_corners, path_camera_matrix, path_undistort = auxiliaryfunctions_3d.Foldernames3Dproject(
        cfg_3d)
    '''
    path_undistort = destfolder
    filename_cam1 = Path(dataframe[0]).stem
    filename_cam2 = Path(dataframe[1]).stem

    #currently no interm. saving of this due to high speed.
    # check if the undistorted files are already present
    if os.path.exists(os.path.join(path_undistort,filename_cam1 + '_undistort.h5')) and os.path.exists(os.path.join(path_undistort,filename_cam2 + '_undistort.h5')):
        print("The undistorted files are already present at %s" % os.path.join(path_undistort,filename_cam1))
        dataFrame_cam1_undistort = pd.read_hdf(os.path.join(path_undistort,filename_cam1 + '_undistort.h5'))
        dataFrame_cam2_undistort = pd.read_hdf(os.path.join(path_undistort,filename_cam2 + '_undistort.h5'))
    else:
    '''
    if True:
        # Create an empty dataFrame to store the undistorted 2d coordinates and likelihood
        dataframe_cam1 = pd.read_hdf(dataframe[0])
        dataframe_cam2 = pd.read_hdf(dataframe[1])
        scorer_cam1 = dataframe_cam1.columns.get_level_values(0)[0]
        scorer_cam2 = dataframe_cam2.columns.get_level_values(0)[0]
        path_stereo_file = os.path.join(path_camera_matrix,
                                        'stereo_params.pickle')
        stereo_file = auxiliaryfunctions.read_pickle(path_stereo_file)
        mtx_l = stereo_file[camera_pair]['cameraMatrix1']
        dist_l = stereo_file[camera_pair]['distCoeffs1']

        mtx_r = stereo_file[camera_pair]['cameraMatrix2']
        dist_r = stereo_file[camera_pair]['distCoeffs2']

        R1 = stereo_file[camera_pair]['R1']
        P1 = stereo_file[camera_pair]['P1']

        R2 = stereo_file[camera_pair]['R2']
        P2 = stereo_file[camera_pair]['P2']

        # Create an empty dataFrame to store the undistorted 2d coordinates and likelihood
        dataFrame_cam1_undistort, scorer_cam1, bodyparts = auxiliaryfunctions_3d.create_empty_df(
            dataframe_cam1, scorer_cam1, flag='2d')
        dataFrame_cam2_undistort, scorer_cam2, bodyparts = auxiliaryfunctions_3d.create_empty_df(
            dataframe_cam2, scorer_cam2, flag='2d')

        for bpindex, bp in tqdm(enumerate(bodyparts)):
            # Undistorting the points from cam1 camera
            points_cam1 = np.array([
                dataframe_cam1[scorer_cam1][bp]['x'].values[:],
                dataframe_cam1[scorer_cam1][bp]['y'].values[:]
            ])
            points_cam1 = points_cam1.T
            points_cam1 = np.expand_dims(points_cam1, axis=1)
            points_cam1_remapped = cv2.undistortPoints(src=points_cam1,
                                                       cameraMatrix=mtx_l,
                                                       distCoeffs=dist_l,
                                                       P=P1,
                                                       R=R1)

            dataFrame_cam1_undistort.iloc[:][scorer_cam1, bp,
                                             'x'] = points_cam1_remapped[:, 0,
                                                                         0]
            dataFrame_cam1_undistort.iloc[:][scorer_cam1, bp,
                                             'y'] = points_cam1_remapped[:, 0,
                                                                         1]
            dataFrame_cam1_undistort.iloc[:][
                scorer_cam1, bp, 'likelihood'] = dataframe_cam1[scorer_cam1][
                    bp]['likelihood'].values[:]

            # Undistorting the points from cam2 camera
            points_cam2 = np.array([
                dataframe_cam2[scorer_cam2][bp]['x'].values[:],
                dataframe_cam2[scorer_cam2][bp]['y'].values[:]
            ])
            points_cam2 = points_cam2.T
            points_cam2 = np.expand_dims(points_cam2, axis=1)
            points_cam2_remapped = cv2.undistortPoints(src=points_cam2,
                                                       cameraMatrix=mtx_r,
                                                       distCoeffs=dist_r,
                                                       P=P2,
                                                       R=R2)

            dataFrame_cam2_undistort.iloc[:][scorer_cam2, bp,
                                             'x'] = points_cam2_remapped[:, 0,
                                                                         0]
            dataFrame_cam2_undistort.iloc[:][scorer_cam2, bp,
                                             'y'] = points_cam2_remapped[:, 0,
                                                                         1]
            dataFrame_cam2_undistort.iloc[:][
                scorer_cam2, bp, 'likelihood'] = dataframe_cam2[scorer_cam2][
                    bp]['likelihood'].values[:]

        # Save the undistorted files
        dataFrame_cam1_undistort.sort_index(inplace=True)
        dataFrame_cam2_undistort.sort_index(inplace=True)

    return (dataFrame_cam1_undistort, dataFrame_cam2_undistort,
            stereo_file[camera_pair], path_stereo_file)
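
`cv2.undistortPoints` expects an (N, 1, 2) float array together with the intrinsics; passing `P` keeps the output in pixel coordinates rather than normalized ones. A minimal sketch with made-up camera parameters:

import cv2
import numpy as np

pts = np.array([[320.0, 240.0], [100.0, 50.0]], dtype=np.float32).reshape(-1, 1, 2)
K = np.array([[800.0, 0.0, 320.0],
              [0.0, 800.0, 240.0],
              [0.0, 0.0, 1.0]])
dist = np.array([-0.2, 0.05, 0.0, 0.0, 0.0])  # k1, k2, p1, p2, k3 (illustrative)

undistorted = cv2.undistortPoints(src=pts, cameraMatrix=K, distCoeffs=dist, P=K)
print(undistorted.shape)  # (2, 1, 2); still pixel coordinates because P=K
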
Example #58
def distance_matrix(s, max_dist=None, max_length_diff=None,
                    window=None, max_step=None, penalty=None, psi=None,
                    block=None, parallel=False,
                    use_c=False, use_nogil=False, show_progress=False):
    """Distance matrix for all sequences in s.

    :param s: Iterable of series
    :param window: see :meth:`distance`
    :param max_dist: see :meth:`distance`
    :param max_step: see :meth:`distance`
    :param max_length_diff: see :meth:`distance`
    :param penalty: see :meth:`distance`
    :param psi: see :meth:`distance`
    :param block: Only compute block in matrix. Expects tuple with begin and end, e.g. ((0,10),(20,25)) will
        only compare rows 0:10 with rows 20:25.
    :param parallel: Use parallel operations
    :param use_c: Use c compiled Python functions (it is recommended to use use_nogil)
    :param use_nogil: Use pure c functions
    :param show_progress: Show progress using the tqdm library
    """
    if parallel and (not use_c or not use_nogil):
        try:
            import multiprocessing as mp
            logger.info('Using multiprocessing')
        except ImportError:
            parallel = False
            mp = None
    else:
        mp = None
    dist_opts = {
        'max_dist': max_dist,
        'max_step': max_step,
        'window': window,
        'max_length_diff': max_length_diff,
        'penalty': penalty,
        'psi': psi
    }
    s = SeriesContainer.wrap(s)
    dists = None
    if max_length_diff is None:
        max_length_diff = np.inf
    large_value = np.inf
    logger.info('Computing distances')
    if use_c:
        for k, v in dist_opts.items():
            if v is None:
                dist_opts[k] = 0.0
    if use_c and use_nogil:
        logger.info("Compute distances in pure C")
        dist_opts['block'] = block
        if parallel:
            logger.info("Use parallel computation")
            dists = dtw_c.distance_matrix_nogil_p(s, **dist_opts)
        else:
            logger.info("Use serial computation")
            dists = dtw_c.distance_matrix_nogil(s, **dist_opts)
    if use_c and not use_nogil:
        logger.info("Compute distances in Python compiled C")
        if parallel:
            logger.info("Use parallel computation")
            dists = np.zeros((len(s), len(s))) + large_value
            if block is None:
                idxs = np.triu_indices(len(s), k=1)
            else:
                idxsl_r = []
                idxsl_c = []
                for r in range(block[0][0], block[0][1]):
                    for c in range(max(r + 1, block[1][0]), min(len(s), block[1][1])):
                        idxsl_r.append(r)
                        idxsl_c.append(c)
                idxs = (np.array(idxsl_r), np.array(idxsl_c))
            with mp.Pool() as p:
                dists[idxs] = p.map(_distance_c_with_params, [(s[r], s[c], dist_opts) for c, r in zip(*idxs)])
                # pbar = tqdm(total=int((len(s)*(len(s)-1)/2)))
                # for r in range(len(s)):
                #     dists[r,r+1:len(s)] = p.map(distance, [(s[r],s[c], dist_opts) for c in range(r+1,len(cur))])
                #     pbar.update(len(s) - r - 1)
                # pbar.close()
        else:
            logger.info("Use serial computation")
            dist_opts['block'] = block
            dists = dtw_c.distance_matrix(s, **dist_opts)
    if not use_c:
        logger.info("Compute distances in Python")
        if parallel:
            logger.info("Use parallel computation")
            dists = np.zeros((len(s), len(s))) + large_value
            if block is None:
                idxs = np.triu_indices(len(s), k=1)
            else:
                idxsl_r = []
                idxsl_c = []
                for r in range(block[0][0], block[0][1]):
                    for c in range(max(r + 1, block[1][0]), min(len(s), block[1][1])):
                        idxsl_r.append(r)
                        idxsl_c.append(c)
                idxs = (np.array(idxsl_r), np.array(idxsl_c))
            with mp.Pool() as p:
                dists[idxs] = p.map(_distance_with_params, [(s[r], s[c], dist_opts) for c, r in zip(*idxs)])
                # pbar = tqdm(total=int((len(s)*(len(s)-1)/2)))
                # for r in range(len(s)):
                #     dists[r,r+1:len(s)] = p.map(distance, [(s[r],s[c], dist_opts) for c in range(r+1,len(cur))])
                #     pbar.update(len(s) - r - 1)
                # pbar.close()
        else:
            logger.info("Use serial computation")
            dists = np.zeros((len(s), len(s))) + large_value
            if block is None:
                it_r = range(len(s))
            else:
                it_r = range(block[0][0], block[0][1])
            if show_progress:
                it_r = tqdm(it_r)
            for r in it_r:
                if block is None:
                    it_c = range(r + 1, len(s))
                else:
                    it_c = range(max(r + 1, block[1][0]), min(len(s), block[1][1]))
                for c in it_c:
                    if abs(len(s[r]) - len(s[c])) <= max_length_diff:
                        dists[r, c] = distance(s[r], s[c], **dist_opts)
    return dists
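
The block handling above reduces to building the (row, column) index pairs whose distances should be computed; the full-matrix case falls back to the upper triangle. A small standalone sketch of just that index construction, following the ((row_begin, row_end), (col_begin, col_end)) block format from the docstring:

import numpy as np

def pair_indices(n, block=None):
    if block is None:
        return np.triu_indices(n, k=1)  # every pair (r, c) with c > r
    rows, cols = [], []
    for r in range(block[0][0], block[0][1]):
        for c in range(max(r + 1, block[1][0]), min(n, block[1][1])):
            rows.append(r)
            cols.append(c)
    return np.array(rows), np.array(cols)

idxs = pair_indices(6, block=((0, 3), (2, 6)))
print(list(zip(*idxs)))  # the (r, c) cells that would be filled with distances
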
Example #59
def main(region, rawregion, data, parameter, npts):
    # region
    minlon, maxlon, minlat, maxlat, mindep, maxdep = [
        float(item) for item in region.split("/")
    ]
    rawminlon, rawmaxlon, rawminlat, rawmaxlat, rawmindep, rawmaxdep = [
        float(item) for item in rawregion.split("/")
    ]
    if (rawminlon > rawmaxlon):
        # interpolation: The points in dimension 0 must be strictly ascending
        rawminlon, rawmaxlon = rawmaxlon, rawminlon
        rawminlat, rawmaxlat = rawmaxlat, rawminlat
    # data
    data = np.load(data)
    # latnpts and lonnpts should be the same if plot vertically
    lonnpts, latnpts, depnpts = [int(item) for item in npts.split("/")]

    plot_vertically = True
    if (mindep == maxdep):
        plot_vertically = False

    hnpts, vnpts = None, None
    if (plot_vertically):
        if (lonnpts != latnpts):
            raise Exception(
                "latnpts and lonnpts should be the same when plotting vertically"
            )
        hnpts = latnpts
        vnpts = depnpts
        print("plot vertically")
    else:
        hnpts = lonnpts
        vnpts = latnpts
        print("plot horizontally")

    print("preparing mesh:")
    lon_list, lat_list, dep_list = prepare_mesh(data, rawminlon, rawmaxlon,
                                                rawminlat, rawmaxlat,
                                                rawmindep, rawmaxdep)

    # get mesh to plot
    print("interp values:")
    lat_mesh, lon_mesh, dep_mesh = None, None, None
    plot_values = np.zeros((hnpts, vnpts))
    array_to_interpolate = np.zeros((hnpts, vnpts, 3))
    if (plot_vertically):
        lat_mesh = np.linspace(minlat, maxlat, hnpts)
        lon_mesh = np.linspace(minlon, maxlon, hnpts)
        dep_mesh = np.linspace(mindep, maxdep, vnpts)
        for ih in tqdm.tqdm(range(hnpts)):
            for iv in range(vnpts):
                # plot_values[ih, iv] = interp_value(
                #     lat_mesh[ih], lon_mesh[ih], dep_mesh[iv], x_mesh, y_mesh, z_mesh, data)
                array_to_interpolate[ih, iv, :] = [
                    lon_mesh[ih], lat_mesh[ih], dep_mesh[iv]
                ]
    else:
        lat_mesh = np.linspace(minlat, maxlat, vnpts)
        lon_mesh = np.linspace(minlon, maxlon, hnpts)
        for ih in tqdm.tqdm(range(hnpts)):
            for iv in range(vnpts):
                # plot_values[ih, iv] = interp_value(
                #     lat_mesh[iv], lon_mesh[ih], mindep, x_mesh, y_mesh, z_mesh, data)
                array_to_interpolate[ih, iv, :] = [
                    lon_mesh[ih], lat_mesh[iv], mindep
                ]

    # build up the interpolation function
    interpolating_function = RegularGridInterpolator(
        (lon_list, lat_list, dep_list), data, method="nearest")
    plot_values = interpolating_function(array_to_interpolate)

    # * plot figures
    print("start to plot")
    plt.figure()
    # get vmin and vmax
    vmin_round = round(np.min(plot_values), 2)
    if (vmin_round < np.min(plot_values)):
        vmin = vmin_round
    else:
        vmin = vmin_round - 0.01
    vmax_round = round(np.max(plot_values), 2)
    if (vmax_round > np.max(plot_values)):
        vmax = vmax_round
    else:
        vmax = vmax_round + 0.01
    # ! set vmin and vmax here
    # vmin = -0.03
    # vmax = 0.03

    v = np.arange(vmin, vmax, 0.01)

    if (plot_vertically):
        # decide to use lat or lon
        lat_diff = np.abs(maxlat - minlat)
        lon_diff = np.abs(maxlon - minlon)
        plot_on = None
        if (lat_diff >= lon_diff):
            mesh_plot_h, mesh_plot_v = np.meshgrid(lat_mesh,
                                                   dep_mesh,
                                                   indexing="ij")
            plot_on = "latitude"
        else:
            mesh_plot_h, mesh_plot_v = np.meshgrid(lon_mesh,
                                                   dep_mesh,
                                                   indexing="ij")
            plot_on = "longitude"
        plt.contourf(mesh_plot_h,
                     mesh_plot_v,
                     plot_values,
                     resolution,
                     cmap=plt.cm.seismic_r,
                     vmin=vmin,
                     vmax=vmax)
        plt.colorbar(ticks=v, label="perturbation")
        plt.gca().invert_yaxis()
        plt.xlabel(
            f"{plot_on}(°) between (lon: {minlon}°, lat: {minlat}°) and (lon: {maxlon}°, lat: {maxlat}°)"
        )
        plt.ylabel("depth(km)")
        plt.title(f"parameter: {parameter}")
        plt.show()
    else:
        mesh_plot_h, mesh_plot_v = np.meshgrid(lon_mesh,
                                               lat_mesh,
                                               indexing="ij")
        plt.contourf(mesh_plot_h,
                     mesh_plot_v,
                     plot_values,
                     resolution,
                     cmap=plt.cm.seismic_r,
                     vmin=vmin,
                     vmax=vmax)
        plt.colorbar(ticks=v, label="perturbation")
        plt.gca().invert_yaxis()
        plt.ylabel(f"longitude(°) between {minlon}° and {maxlon}°")
        plt.ylabel(f"latitude(°) between {minlat}° and {maxlat}°")
        plt.title(f"depth: {mindep}km, parameter: {parameter}")
        plt.show()
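
The interpolation step above comes down to building a `RegularGridInterpolator` from the three coordinate axes and the gridded values, then evaluating it on an array of (lon, lat, dep) query points. A minimal sketch on a toy grid with random values:

import numpy as np
from scipy.interpolate import RegularGridInterpolator

lon = np.linspace(100.0, 110.0, 11)
lat = np.linspace(20.0, 30.0, 11)
dep = np.linspace(0.0, 100.0, 6)
values = np.random.rand(len(lon), len(lat), len(dep))  # stand-in for the loaded model

interp = RegularGridInterpolator((lon, lat, dep), values, method="nearest")
queries = np.array([[105.0, 25.0, 50.0],
                    [101.3, 22.7, 10.0]])  # one (lon, lat, dep) triple per row
print(interp(queries))                     # one interpolated value per query point
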
Example #60
    def create_terms(self,
                     obo_path,
                     shortname,
                     attrs=["def", "synonym", "subset", "alt_id", "dbxref"]):
        self.cache = {}
        ids = []
        with open(
                obo_path
        ) as h:  # filter the ids because GODag iterates over alt_ids too
            for l in h.readlines():
                if l.startswith("id: " + shortname.upper() + ":"):
                    ids.append(l.split(" ")[1].strip())
        ids = set(ids)

        # For SO (Sequence Ontology) use attrs = ["def", "subset", "dbxref", "alt_id"]

        go_dag = GODag(obo_path, load_obsolete=True, optional_attrs=attrs)

        finished = False
        pbar = iter(tqdm(ids))
        while not finished:
            with transaction.atomic():
                for _ in range(2000):
                    try:
                        go = next(pbar)
                        if go not in go_dag:
                            continue
                        term = go_dag[go]
                        if not Term.objects.filter(ontology=self.ontology,
                                                   identifier=go).exists():
                            dbTerm = Term(
                                name=term.name,
                                definition=term.defn
                                if hasattr(term, "defn") else "",
                                identifier=go,
                                is_obsolete="T" if term.is_obsolete else "F",
                                ontology=self.ontology)
                            dbTerm.save()
                            if term.namespace:
                                termdbref = TermDbxref(
                                    term=dbTerm,
                                    dbxref=Ontology.dbmap[term.namespace],
                                    rank=1)
                                termdbref.save()

                            for subset in term.subset:
                                if subset in Ontology.dbmap:
                                    termdbref = TermDbxref(
                                        term=dbTerm,
                                        dbxref=Ontology.dbmap[subset],
                                        rank=1)
                                    termdbref.save()
                            if hasattr(term, "synonym"):
                                for synonym in term.synonym:
                                    TermSynonym.objects.get_or_create(
                                        term=dbTerm, synonym=synonym[0][:255])

                            for alt_id in term.alt_ids:
                                # alt_ids are plain identifier strings, not (text, ...) tuples
                                TermSynonym.objects.get_or_create(
                                    term=dbTerm, synonym=alt_id[:255])

                            self.cache[go] = dbTerm
                        else:
                            self.cache[go] = Term.objects.filter(
                                ontology=self.ontology, identifier=go).get()
                            print("repeated: " + go)
                    except StopIteration:
                        finished = True
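
The loader above commits in chunks of 2000 by pulling from a single iterator inside nested loops and catching StopIteration. The same chunking can be written without the sentinel flag using itertools.islice; a framework-free sketch where the inner loop body stands in for the Django transaction and ORM calls:

from itertools import islice
from tqdm import tqdm

def chunked(iterable, size):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            return
        yield chunk

ids = [f"GO:{i:07d}" for i in range(5000)]  # illustrative identifiers
for chunk in chunked(tqdm(ids), 2000):
    # one transaction per chunk; in Django this would be `with transaction.atomic():`
    for identifier in chunk:
        pass  # create or look up the term here
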