Code example #1
    def oneD(self, nxdata, scan):
        '''*internal*: generic data parser for 1-D column data, returns signal and axis'''
        for column in scan.L:
            self.write_ds(nxdata, column, scan.data[column])

        signal = utils.clean_name(scan.column_last)  # primary Y axis
        axis = utils.clean_name(scan.column_first)  # primary X axis
        self.mca_spectra(nxdata, scan, axis)  # records any MCA data
        return signal, axis
Code example #2
File: writer.py Project: prjemian/spec2nexus
    def oneD(self, nxdata, scan):
        '''*internal*: generic data parser for 1-D column data, returns signal and axis'''
        for column in scan.L:
            self.write_ds(nxdata, column, scan.data[column])

        signal = utils.clean_name(scan.column_last)      # primary Y axis
        axis = utils.clean_name(scan.column_first)       # primary X axis
        self.mca_spectra(nxdata, scan, axis)             # records any MCA data
        return signal, axis
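In examples #1 and #2 (and the other writer.py snippets below), utils.clean_name turns a SPEC column label into a name that is legal as an HDF5/NeXus dataset name. As a rough illustration only (a minimal sketch under assumed behaviour, not the actual spec2nexus implementation), such a sanitizer could look like this:

import re

def clean_name(key):
    # Sketch only (assumed behaviour): replace every character that is not a
    # letter, digit, or underscore with "_", and prefix the result with "_"
    # if it would otherwise start with a digit.
    safe = re.sub(r'\W', '_', key)
    if safe and safe[0].isdigit():
        safe = '_' + safe
    return safe

# e.g. clean_name('2-theta') -> '_2_theta'; clean_name('Epoch') -> 'Epoch'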
Code example #3
File: nrkdl.py Project: Hellowlol/nrkdl
    def download(self, path=None):
        if self.available is False:
            # print('Cant download %s' % c_ount(self.name))
            return

        url = self.media_url
        if url is None:
            return

        if path is None:
            path = SAVE_PATH

        folder = clean_name(self.name)

        try:
            # Make sure the show folder exists
            os.makedirs(os.path.join(path, folder))
        except OSError as e:
            if not os.path.isdir(os.path.join(path, folder)):
                raise

        fp = os.path.join(path, folder, self.file_name)
        q = 'high'  # fix me
        t = (url, q, fp)
        Downloader(self).add((url, q, fp))
        return t
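In the nrkdl examples (#3, #8, #15, #16, #20, #23), clean_name(self.name) is used to build folder and file names on disk. A heavily simplified sketch of that kind of helper (an assumption, not the project's actual code):

import re

def clean_name(name):
    # Sketch only: strip characters that are illegal or awkward in file and
    # folder names on common platforms, then trim surrounding whitespace.
    return re.sub(r'[\\/:*?"<>|]', '', name).strip()

# e.g. clean_name('Brille: sesong 2') -> 'Brille sesong 2'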
Code example #4
def process_midwest_html(fname, product_dict):
    '''Process the midwest html file'''
    with open(fname, 'r') as f:
        soup = BeautifulSoup(f, 'html.parser')  # specify a parser explicitly

    product_li = soup.find_all('li', {'class': 'account-listItem'})

    for product in product_li:
        titles = product.find('h5', {"class": "account-product-title"})
        try:
            # add any variables if there are any
            vars = product.find_all('dd', {'class': 'definitionList-value'})
            vars = " ".join([v.string for v in vars if v.string])
            if not vars:  # join() returns '' (never None) when nothing was found
                name = titles.string
            else:
                name = titles.string + vars
            # change quantity to int
            quantity = int(name[0])
            name = name[4:]
            # clean name
            name = clean_name(name, replace_dict_noreg)
            # add to dict
            product_dict[name] = quantity
        except AttributeError as e:
            pass

    return product_dict
Code example #5
File: specplot.py Project: JPHammonds/spec2nexus
    def retrieve_plot_data(self):
        '''
        retrieve default data from the SPEC data file:
        data parser for 2-D mesh and hklmesh
        '''
        label1, start1, end1, intervals1, label2, start2, end2, intervals2, time = self.scan.scanCmd.split()[1:]
        if label1 not in self.scan.data:
            label1 = self.scan.L[0]      # mnemonic v. name
        if label2 not in self.scan.data:
            label2 = self.scan.L[1]      # mnemonic v. name
        axis1 = self.scan.data.get(label1)
        axis2 = self.scan.data.get(label2)
        intervals1, intervals2 = map(int, (intervals1, intervals2))
        start1, end1, start2, end2, time = map(float, (start1, end1, start2, end2, time))

        if len(axis1) < intervals1 and min(axis2) == max(axis2):
            # stopped scan before second row started, 1-D plot is better (issue #82)
            self.axes = [label1,]
            self.signal = self.scan.column_last
            self.data[label1] = self.scan.data[label1]
            self.data[self.signal] = self.scan.data[self.signal]
            return

        axis1 = axis1[0:intervals1+1]
        self.data[label1] = axis1    # 1-D array

        axis2 = [axis2[row] for row in range(len(axis2)) if row % (intervals1+1) == 0]
        self.data[label2] = axis2    # 1-D array

        column_labels = self.scan.L
        column_labels.remove(label1)    # special handling
        column_labels.remove(label2)    # special handling
        if self.scan.scanCmd.startswith('hkl'):
            # find the reciprocal space axis held constant
            label3 = [key for key in ('H', 'K', 'L') if key in column_labels][0]
            self.data[label3] = self.scan.data.get(label3)[0]    # constant

        # build 2-D data objects (do not build label1, label2, [or label3] as 2-D objects)
        data_shape = [len(axis2), len(axis1)]
        for label in column_labels:
            if label not in self.data:
                axis = numpy.array( self.scan.data.get(label) )
                self.data[label] = utils.reshape_data(axis, data_shape)
            else:
                pass

        self.signal = utils.clean_name(self.scan.column_last)
        self.axes = [label1, label2]
    
        if spec.MCA_DATA_KEY in self.scan.data:    # 3-D array(s)
            # save each spectrum
            for key, spectrum in sorted(self.scan.data[spec.MCA_DATA_KEY].items()):
                num_channels = len(spectrum[0])
                data_shape.append(num_channels)
                mca = numpy.array(spectrum)
                data = utils.reshape_data(mca, data_shape)
                channels = range(1, num_channels+1)
                ds_name = '_' + key + '_'
                self.data[ds_name] = data
                self.data[ds_name+'channel_'] = channels
Code example #6
File: w2v.py Project: jvalansi/word2vec_exp
    def evaluate_model(self,
                       questions_fpath=os.path.join('res', 'model',
                                                    'questions-words.txt')):
        if clean_name(self.fname).endswith('pos'):
            pos_file(questions_fpath)
            questions_fpath = questions_fpath + '.pos'
        return self.model.accuracy(questions_fpath)
Code example #7
File: w2v.py Project: jvalansi/word2vec_exp
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("-mn", "--model_name", help="model name", default='spanishEtiquetado.bin')
    parser.add_argument("-qn", "--questions_name", help="questions name", default='ambiguous_verbs.sp')
    parser.add_argument("-w", "--window", help="model window size", type=int, default=5)
    parser.add_argument("-n", "--n_proc", help="number of processes", type=int, default=4)
    args = parser.parse_args()
    
    word2vec.logger.setLevel(logging.DEBUG)
    
    w2v = W2V(args.model_name,n_proc=args.n_proc, window=args.window)
    pos_name = clean_name(args.model_name) +'.pos' + '.bin'
    w2v_pos = W2V(pos_name,n_proc=args.n_proc, window=args.window)

#     print(len(word2vec_exp.model.vocab))
#     print(word2vec_exp.model.vocab.items()[:10])
#     print(word2vec_exp.model.similarity('add_VB','remove_VB'))
#     print(len(model.vocab.keys()))    

    questions_fpath = os.path.join('res', 'mult', args.questions_name)
    print(datetime.datetime.now())
    eval1 = w2v.evaluate_model(questions_fpath)
    print(datetime.datetime.now())
    eval2 = w2v_pos.evaluate_model(questions_fpath)
    print(datetime.datetime.now())
    missing1, missing2 = compare_section(eval1, eval2, to_section_name(args.questions_name))
Code example #8
    def download(self, path=None):
        if self.available is False:
            # print('Cant download %s' % c_ount(self.name))
            return

        url = self.media_url
        if url is None:
            return

        if path is None:
            path = SAVE_PATH

        folder = clean_name(self.name)

        try:
            # Make sure the show folder exists
            os.makedirs(os.path.join(path, folder))
        except OSError:
            if not os.path.isdir(os.path.join(path, folder)):
                raise

        fp = os.path.join(path, folder, self.file_name)
        q = 'high'  # fix me
        t = (url, q, fp)
        Downloader(self).add((url, q, fp))
        return t
Code example #9
    def clean_results(self, results, imdb=False):
        subtitles = {}
        user_ranks = {  'administrator': 1,
                        'platinum member': 2,
                        'vip member': 3,
                        'gold member': 4,
                        'trusted': 5,
                        'silver member': 6,
                        'bronze member': 7,
                        'sub leecher': 8,
                        '': 9, }

        for result in results:
            if result['SubBad'] != '1':
                movie_hash = result.get('MovieHash')
                if not movie_hash:
                    movie_hash = self.imdbid_to_hash[int(result['IDMovieImdb'])]
                subid = result['IDSubtitleFile']
                downcount = int(result['SubDownloadsCnt'])
                rating = float(result['SubRating'])

                if rating and rating < 8:
                    # Ignore poorly rated subtitles, while not
                    # penalizing the ones that haven't yet been rated
                    continue

                user_rank = user_ranks[result['UserRank']]

                if imdb:
                    cleaned_release_name = utils.clean_name(result['MovieReleaseName'])
                    file_name = self.moviefiles[movie_hash]['file_name']
                    cleaned_file_name = utils.clean_name(file_name)
                    overlap = len(set.intersection(set(cleaned_release_name), set(cleaned_file_name)))
                else:
                    overlap = 0

                subtitles.setdefault(movie_hash, []).append({
                    'subid': subid,
                    'downcount': downcount,
                    'rating': rating,
                    'user_rank': user_rank,
                    'overlap' : overlap
                    })

        return subtitles
Code example #10
def compute_position(event, context):

    # Load Price from s3 bucket
    bucket_name = environ.get("BUCKET")
    usdt_file_name = environ.get("USDT_PRICE_FILE_NAME")
    client = boto3.client("s3")
    file = client.get_object(Bucket=bucket_name, Key=usdt_file_name)
    price_df = pd.read_csv(file["Body"], compression='gzip')

    order_history = json.loads(event['body'])['data']
    order_df = pd.DataFrame(order_history)
    order_df = order_df.apply(pd.to_numeric, errors='ignore')
    order_df = order_df[order_df['executedQty'] > 0]
    order_df = order_df[[
        'symbol', 'executedQty', 'side', 'updateTime', 'price'
    ]]
    order_df['updateTime'] = order_df['updateTime'].apply(convert_to_datetime)

    order_df['symbol'] = order_df['symbol'].apply(clean_name)
    order_df.reset_index(inplace=True, drop=True)
    order_df['symbol'] = order_df['symbol'] + "USDT_close"
    order_df['side'] = np.where(order_df['side'] == "BUY", 1, -1)

    order_df['executedQty'] = order_df['executedQty'] * order_df['side']

    price_df = price_df[list(order_df['symbol'].unique()) + ['timestamp']]
    price_df['timestamp'] = pd.to_datetime(price_df['timestamp'])
    price_df = price_df[price_df['timestamp'] >= order_df['updateTime'].min() -
                        timedelta(hours=4)]  # [Potential Bug]
    price_df.set_index('timestamp', inplace=True)

    pos_df = pd.DataFrame(columns=price_df.columns)
    for t in price_df.index:
        temp = order_df[order_df['updateTime'] < t]
        temp = temp.groupby('symbol').sum()['executedQty']
        pos_df = pos_df.append(temp)
    pos_df.index = price_df.index
    nav_df = price_df.multiply(pos_df, axis=0)
    nav_df.index = pd.to_datetime(nav_df.index, format="%Y-%m-%d")
    nav_df.columns = [clean_name(n) for n in list(nav_df.columns)]
    nav_df = nav_df.fillna(0)
    nav_timeseries_data = []
    for t in nav_df.index:
        unix_secs = mktime(t.timetuple())
        for col in nav_df.columns:
            nav_timeseries_data.append([unix_secs, col, nav_df.loc[t, col]])

    response = {
        "statusCode": "200",
        "headers": {
            'Access-Control-Allow-Origin': "*",
            'Access-Control-Allow-Credentials': True,
        },
        "body": json.dumps({"data": nav_timeseries_data})
    }

    return response
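Here clean_name is applied to Binance pair symbols before the "USDT_close" suffix is re-attached, and again to strip it from the NAV columns, so it presumably reduces a pair or column name to its base asset. A hypothetical sketch of that behaviour (the quote-currency list is an assumption, loosely borrowed from example #31 below):

def clean_name(symbol, quotes=("USDT", "BUSD", "BTC", "ETH", "BNB")):
    # Sketch only: drop a trailing "_close" column suffix and a known
    # quote-currency suffix, e.g. "ETHUSDT_close" -> "ETH".
    symbol = symbol.replace("_close", "")
    for quote in quotes:
        if symbol.endswith(quote) and symbol != quote:
            return symbol[:-len(quote)]
    return symbol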
Code example #11
def gen_one_test_feature():
    # process test data and save in pickle
    # testdatafeatures --> {pid-with-index: {candidate-aids: [...], data: [[xxx], [xxx], [xxx]...]}}
    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    # whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    aid2yearinfo = load_pickle(
        os.path.join(NEW_DATA_V2_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(
        os.path.join(NEW_DATA_V2_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    # aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))

    testdatafeatures = {}
    all_authors_name = list(name2aids.keys())
    all_aids = []
    for key in name2aids:
        aids = name2aids[key]
        all_aids.extend(aids.tolist())
    all_aids = np.array(all_aids)
    for pid_with_index in tqdm.tqdm(valid_nuass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = valid_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        index = get_name_index(author_name, all_authors_name)
        author_name = all_authors_name[index]

        candidate_aids = name2aids[author_name]
        candidate_aids = all_aids
        inner_dict['candidate-aids'] = candidate_aids
        data = []
        for aid in candidate_aids:
            print(aid)
            new_pair = (aid, pid_with_index)
            pid_info_dict = valid_pub[now_pid]
            aid_author_info_dict = aid2coauthor[aid]
            aid_year_info_dict = aid2year[aid]
            aid_venue_dict = aid2venue[aid]
            aid_org_year_list = aid2orgwithyear[aid]
            aid_keywords_dict = aid2keywords[aid]
            aid_year_all_info_dict = aid2yearinfo[aid]
            data.append(
                get_features(new_pair, pid_info_dict, aid_author_info_dict,
                             aid_year_info_dict, aid_venue_dict,
                             aid_org_year_list, aid_keywords_dict,
                             aid_year_all_info_dict))
        data = np.array(data)
        inner_dict['data'] = data
        testdatafeatures[pid_with_index] = inner_dict
        break
    save_pickle(testdatafeatures, './testdatafeatures_one.pkl')
Code example #12
def gen_test_feature():
    # process test data and save in pickle
    # testdatafeatures --> {pid-with-index: {candidate-aids: [...], data: [[xxx], [xxx], [xxx]...]}}
    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    # whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    aid2yearinfo = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    aid2orgset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))
    aid2venueset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venueset.pkl'))
    aid2keywordsset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))

    all_pids_len = 0
    for aid in aid2pids:
        all_pids_len += len(aid2pids[aid])
    testdatafeatures = {}
    all_authors_name = list(name2aids.keys())
    # author_name_count = defaultdict(int)
    for pid_with_index in tqdm.tqdm(valid_nuass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = valid_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        index = get_name_index(author_name, all_authors_name)
        author_name = all_authors_name[index]
        # author_name_count[author_name] += 1
        # continue

        candidate_aids = name2aids[author_name]
        inner_dict['candidate-aids'] = candidate_aids
        data = []
        for aid in candidate_aids:
            new_pair = (aid, pid_with_index)
            pid_info_dict = valid_pub[now_pid]
            aid_author_info_dict = aid2coauthor[aid]
            aid_year_info_dict = aid2year[aid]
            aid_venue_dict = aid2venue[aid]
            aid_org_year_list = aid2orgwithyear[aid]
            aid_keywords_dict = aid2keywords[aid]
            aid_year_all_info_dict = aid2yearinfo[aid]
            org_info_set = aid2orgset[aid]
            aid_venue_set = aid2venueset[aid]
            aid_keywords_set = aid2keywordsset[aid]
            data.append(get_features(new_pair, pid_info_dict, aid_author_info_dict, aid_year_info_dict, aid_venue_dict, aid_org_year_list, aid_keywords_dict, aid_year_all_info_dict, org_info_set, aid_venue_set, aid_keywords_set))
            data[-1].append(len(aid2pids[aid]) / all_pids_len)
        data = np.array(data)
        inner_dict['data'] = data
        testdatafeatures[pid_with_index] = inner_dict
    save_pickle(testdatafeatures, os.path.join(TEST_FEATURE_DIR_V2, 'testdatafeatures-withsetinfo-papercount.pkl'))
Code example #13
def get_coauthor_count_for_enhence(aid_pid_pair, aid_author_info_dict,
                                   pid_info_dict):
    index = int(aid_pid_pair[1].split('-')[1])
    authors = pid_info_dict['authors']
    authors = [clean_name(item['name']) for item in authors]
    authors.pop(index)
    count = 0
    for author_name in authors:
        if author_name in aid_author_info_dict.keys():
            count += 1
    return count
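The author-disambiguation examples (#11 to #13, #18, #22, #29, #30) normalise author names with clean_name before indexing into name2aids; the hard-coded values 'junliang_wang' and 'xiaojun_liu' in examples #18 and #22 suggest the convention. A minimal sketch of that convention (an assumption, not the project's exact code):

import re

def clean_name(name):
    # Sketch only: lower-case, drop hyphens, dots and apostrophes, and join
    # the remaining parts with underscores,
    # e.g. "Jun-Liang Wang" -> "junliang_wang".
    name = re.sub(r"[-.']", '', name.lower())
    return '_'.join(name.split())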
Code example #14
File: w2v.py Project: jvalansi/word2vec_exp
    def create_model(self, fname, max_news=99, n_proc=1, window=5, splits=100):
        name = clean_name(fname)
        model = word2vec.Word2Vec(window=window, workers=n_proc)
        if name == 'text8':
            sentences = word2vec.Text8Corpus(os.path.join('res', 'model', 'text8'))
            model.train(sentences)
        elif name == 'brown':
        #     sentences = word2vec.BrownCorpus(fpath)
            sentences = brown.sents()
            model.train(sentences)
        elif name.startswith('news'):
            target_fpath = os.path.join('res', 'model', name+'.txt')
            if not os.path.exists(target_fpath):
                build_news_corpus(name, max_news, n_proc, target_fpath)
            sentences = word2vec.LineSentence(target_fpath)
            model.build_vocab(sentences)
            model.train(sentences)
#         elif name.startswith('wikipedia.deps'):
#             target_fpath = os.path.join('res', 'model', name+'.txt')
#             if not os.path.exists(target_fpath):
#                 build_wikipedia_corpus(name, max_news, n_proc, target_fpath)
        elif name.startswith('spanishEtiquetado'):
            target_fpath = os.path.join('res', 'model', name+'.txt')
            if not os.path.exists(target_fpath):
                path = os.path.join('res', 'model', 'spanishEtiquetado')
                max_pos_len = re.search(r'\d+', name)
                if max_pos_len:
                    max_pos_len = int(max_pos_len.group(0))
                build_corpus(path, name.endswith('pos'), target_fpath, max_pos_len)
            sentences = word2vec.LineSentence(target_fpath)
#             with open(target_fpath) as fp:
#                 sentences = fp.readlines()
            model.build_vocab(sentences)
            model.train(sentences)        
        else:
            target_fpath = os.path.join('res', 'model', name+'.txt')
            file_to_lower(target_fpath)
            sentences = word2vec.LineSentence(target_fpath)
            model.build_vocab(sentences)
            model.train(sentences)
#             n_sents = len(sentences)  
#             print(n_sents)
#             if splits == 0:
#                 splits = 1
#             split_size = int(n_sents/splits)
#             for i in range(splits):
#                 print(str(i) + '\r')
#                 split_sentences = sentences[i*split_size:(i+1)*split_size-1]
#                 model.save_word2vec_format(os.path.join('res', 'model', fname), binary=fname.endswith('.bin'))
#                 model.save()  
                         
    #     model.save(os.path.join('res',name+'.model'))
        model.save_word2vec_format(os.path.join('res', 'model', fname), binary=fname.endswith('.bin'))
Code example #15
    def __init__(self, data, *args, **kwargs):
        self.data = data
        self.name = data.get('name', '') or data.get('title', '')
        self.name = self.name.strip()
        self.title = data.get('title', '')
        self.type = data.get('type')
        self.id = data.get('id')
        self.available = data.get('isAvailable', False)
        self._image_url = "http://m.nrk.no/m/img?kaleidoId=%s&width=%d"
        if self.data.get('episodeNumberOrDate'):
            self.full_title = '%s %s' % (self.name, self._fix_sn(self.data.get('seasonId'), season_ids=kwargs.get('seasonIds')))
        else:
            self.full_title = self.title

        self.file_name = self._filename(self.full_title)
        self.file_path = os.path.join(SAVE_PATH, clean_name(self.name), self.file_name)
        self._image_id = data.get('imageId') or kwargs.get('imageId')
Code example #16
File: nrkdl.py Project: Hellowlol/nrkdl
    def __init__(self, data, *args, **kwargs):
        self.data = data
        self.name = data.get('name', '') or data.get('title', '')
        self.name = self.name.strip()
        self.title = data.get('title', '')
        self.type = data.get('type')
        self.id = data.get('id')
        self.available = data.get('isAvailable', False)
        self._image_url = "http://m.nrk.no/m/img?kaleidoId=%s&width=%d"

        if self.data.get('episodeNumberOrDate'):
            self.full_title = '%s %s' % (self.name, self._fix_sn(self.data.get('seasonId'), season_ids=kwargs.get('seasonIds')))
        else:
            self.full_title = self.title

        self.file_name = self._filename(self.full_title)
        self.file_path = os.path.join(SAVE_PATH, clean_name(self.name), self.file_name)
        self._image_id = data.get('imageId') or kwargs.get('imageId')
Code example #17
def gen_test_title_abstract_vec(mission='title'):
    if mission == 'title':
        aid2cate = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2titlevec.pkl'))
    elif mission == 'abstract':
        aid2cate = load_pickle(
            os.path.join(NEW_DATA_DIR, 'aid2abstractvec.pkl'))
    else:
        raise ValueError('mission value error')

    valid_nuass = load_json(VALID_UNASS_PATH)
    valid_pub = load_json(VALID_PUB_PATH)
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    texttovec = TextToVec()

    all_authors_name = list(name2aids.keys())
    # test_cate_feature --> {pid-with-index: {candidate-aids: [...], data: [(emb0, meb1), ...]}}
    test_cate_feature = {}
    for pid_with_index in tqdm.tqdm(valid_nuass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = valid_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        index = get_name_index(author_name, all_authors_name)
        author_name = all_authors_name[index]

        candidate_aids = name2aids[author_name]
        inner_dict['candidate-aids'] = candidate_aids
        data = []
        for aid in candidate_aids:
            info = valid_pub[now_pid].get(mission)
            if info is None:
                emb = np.zeros(300)
            else:
                emb = texttovec.get_vec(info)
            emb_pair = (aid2cate[aid], emb)
            data.append(emb_pair)
        inner_dict['data'] = data
        test_cate_feature[pid_with_index] = inner_dict
    save_pickle(
        test_cate_feature,
        os.path.join(TEST_FEATURE_DIR_V2, 'test-%s-emb-pair.pkl' % mission))
Code example #18
def check_name():
    problem_pids = load_json(os.path.join(FINAL_DIR, 'problem.pids.3.json'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    test_pub = load_json(TEST_PUB_PATH)
    all_authors_name = list(name2aids.keys())

    name_map = []
    for pid_with_index in tqdm.tqdm(problem_pids):
        now_pid, index = pid_with_index.split('-')
        author_name_no_clean = test_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name_no_clean)
        if pid_with_index == 'ToCcabLT-1':
            author_name = 'junliang_wang'
        if pid_with_index == 'cVvvcFzj-1':
            author_name = 'xiaojun_liu'

        index = get_name_index(author_name, all_authors_name)
        author_name_inlist = all_authors_name[index]
        # if author_name_inlist != author_name:
        name_map.append((pid_with_index, author_name_no_clean, author_name,
                         author_name_inlist))
    name_map = list(set(name_map))
    print(len(name_map))
    save_json(name_map, os.path.join(FINAL_DIR, 'name.different.3.json'))
Code example #19
            continue

        ms_song_id = dir_name.split("/")[-1]
        if ms_song_id not in songs_info:
            print("========== SONG", ms_song_id,
                  "NOT IN SONGS INFO DATABASE ============")
            continue

        # A song directory might contain more than one version, select one
        selected_version_path = get_version_with_highest_unr(dir_name, files)
        #selected_version_md5 = os.path.splitext(selected_version_path)[0].split("/")[-1]

        with open(selected_version_path, "rb") as midi_file:
            selected_version_md5 = hashlib.md5(midi_file.read()).hexdigest()

        # Check for duplicates
        if selected_version_md5 not in songs:
            songs[selected_version_md5] = selected_version_path

            # Get song and artist names
            h5 = ms.hdf5_getters.open_h5_file_read(songs_info[ms_song_id])

            song_name = clean_name(ms.hdf5_getters.get_title(h5))
            artist_name = clean_name(ms.hdf5_getters.get_artist_name(h5))

            h5.close()

            print("Adding song", song_name, "by", artist_name)
            add_song(selected_version_path, song_name, artist_name, opt.out,
                     genre_mapping)
Code example #20
    def _filename(self, name=None):
        name = clean_name('%s' % (name or self.full_title))
        name = name.replace(' ', '.') + '.WEBDL-nrkdl'
        return name
Code example #21
File: info_hell.py Project: Wikimedia-Sverige/SMV
    def generate_depicted_people(self):
        depicted = " / ".join(
            [utils.clean_name(x["name"]) for x in self.depicted])
        return depicted
Code example #22
def gen_base_feature(index, multi_size):
    # process test data and save in pickle
    # testdatafeatures --> {pid-with-index: {candidate-aids: [...], data: [[xxx], [xxx], [xxx]...]}}
    test_unass = load_json(TEST_UNASS_PATH)
    test_pub = load_json(TEST_PUB_PATH)
    # whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    aid2yearinfo = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2yearinfo.pkl'))
    aid2coauthor = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2coauthor.pkl'))
    aid2venue = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))
    aid2keywords = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2keywords.pkl'))
    aid2year = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))
    aid2orgwithyear = load_pickle(
        os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    # aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    aid2orgset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))
    aid2venueset = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2venueset.pkl'))
    aid2keywordsset = load_pickle(
        os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))

    name_map = load_json(
        os.path.join(FINAL_DIR, 'name.different.modified.json'))
    original_name = [pair[0] for pair in name_map]
    changed_name = [pair[1] for pair in name_map]
    name_map2 = load_json(
        os.path.join(FINAL_DIR, 'name.different.2.modified.json'))
    original_name2 = [pair[0] for pair in name_map2]
    changed_name2 = [pair[1] for pair in name_map2]

    single_range = math.ceil(len(test_unass) / multi_size)
    start = index * single_range
    end = (index + 1) * single_range if (index + 1) * single_range < len(
        test_unass) else len(test_unass)

    testdatafeatures = {}
    all_authors_name = list(name2aids.keys())
    print('Gen test features ...')
    for pid_with_index in tqdm.tqdm(test_unass[start:end]):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = test_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        if pid_with_index == 'ToCcabLT-1':
            author_name = 'junliang_wang'
        if pid_with_index == 'cVvvcFzj-1':
            author_name = 'xiaojun_liu'

        if author_name in original_name2:
            name_index = original_name2.index(author_name)
            author_name = changed_name2[name_index]
        elif author_name in original_name:
            name_index = original_name.index(author_name)
            author_name = changed_name[name_index]
        else:
            index = get_name_index(author_name, all_authors_name)
            author_name = all_authors_name[index]

        if isinstance(author_name, str):
            candidate_aids = name2aids[author_name]
        elif isinstance(author_name, list):
            candidate_aids = []
            for name in author_name:
                candidate_aids.extend(name2aids[name].tolist())
            candidate_aids = np.array(candidate_aids)
        else:
            raise ValueError("check author name ! ! !")

        inner_dict['candidate-aids'] = candidate_aids
        data = []
        for aid in candidate_aids:
            new_pair = (aid, pid_with_index)
            pid_info_dict = test_pub[now_pid]
            aid_author_info_dict = aid2coauthor[aid]
            aid_year_info_dict = aid2year[aid]
            aid_venue_dict = aid2venue[aid]
            aid_org_year_list = aid2orgwithyear[aid]
            aid_keywords_dict = aid2keywords[aid]
            aid_year_all_info_dict = aid2yearinfo[aid]
            org_info_set = aid2orgset[aid]
            aid_venue_set = aid2venueset[aid]
            aid_keywords_set = aid2keywordsset[aid]
            data.append(
                get_features(new_pair, pid_info_dict, aid_author_info_dict,
                             aid_year_info_dict, aid_venue_dict,
                             aid_org_year_list, aid_keywords_dict,
                             aid_year_all_info_dict, org_info_set,
                             aid_venue_set, aid_keywords_set))
        data = np.array(data)
        inner_dict['data'] = data
        testdatafeatures[pid_with_index] = inner_dict
    # save_pickle(testdatafeatures, os.path.join(TEST_FEATURE_DIR, 'u6uRzaff-5.pkl'))
    return testdatafeatures
Code example #23
File: nrkdl.py Project: Hellowlol/nrkdl
    def _filename(self, name=None):
        name = clean_name('%s' % (name or self.full_title))
        name = name.replace(' ', '.') + '.WEBDL-nrkdl'
        return name
Code example #24
File: writer.py Project: prjemian/spec2nexus
    def write_ds(self, group, label, data, **attr):
        '''*internal*: writes a dataset to the HDF5 file, records the SPEC name as an attribute'''
        clean_name = utils.clean_name(label)
        eznx.write_dataset(group, clean_name, data, spec_name=label, **attr)
Code example #25
File: writer.py Project: prjemian/spec2nexus
    def mesh(self, nxdata, scan):
        '''*internal*: data parser for 2-D mesh and hklmesh'''
        # 2-D parser: http://www.certif.com/spec_help/mesh.html
        # mesh motor1 start1 end1 intervals1 motor2 start2 end2 intervals2 time
        # 2-D parser: http://www.certif.com/spec_help/hklmesh.html
        #  hklmesh Q1 start1 end1 intervals1 Q2 start2 end2 intervals2 time
        # mesh:    data/33id_spec.dat  scan 22
        # hklmesh: data/33bm_spec.dat  scan 17
        signal, axes = '', ['',]
        
        label1, start1, end1, intervals1, label2, start2, end2, intervals2, time = scan.scanCmd.split()[1:]
        if label1 not in scan.data:
            label1 = scan.L[0]      # mnemonic v. name
        if label2 not in scan.data:
            label2 = scan.L[1]      # mnemonic v. name
        axis1 = scan.data.get(label1)
        axis2 = scan.data.get(label2)
        intervals1, intervals2 = map(int, (intervals1, intervals2))
        start1, end1, start2, end2, time = map(float, (start1, end1, start2, end2, time))
        if len(axis1) < intervals1:     # stopped scan before second row started
            signal, axes = self.oneD(nxdata, scan)        # fallback support
        else:
            axis1 = axis1[0:intervals1+1]
            axis2 = [axis2[row] for row in range(len(axis2)) if row % (intervals1+1) == 0]

            column_labels = scan.L
            column_labels.remove(label1)    # special handling
            column_labels.remove(label2)    # special handling
            if scan.scanCmd.startswith('hkl'):
                # find the reciprocal space axis held constant
                label3 = [key for key in ('H', 'K', 'L') if key not in (label1, label2)][0]
                axis3 = scan.data.get(label3)[0]
                self.write_ds(nxdata, label3, axis3)

            self.write_ds(nxdata, label1, axis1)    # 1-D array
            self.write_ds(nxdata, label2, axis2)    # 1-D array

            # build 2-D data objects (do not build label1, label2, [or label3] as 2-D objects)
            data_shape = [len(axis1), len(axis2)]
            for label in column_labels:
                if label not in nxdata:
                    axis = np.array( scan.data.get(label) )
                    self.write_ds(nxdata, label, utils.reshape_data(axis, data_shape))
                else:
                    pass

            signal = utils.clean_name(scan.column_last)
            axes = ':'.join([label1, label2])

        if '_mca_' in scan.data:    # 3-D array(s)
            # save each spectrum
            for key, spectrum in sorted(scan.data['_mca_'].items()):
                num_channels = len(spectrum[0])
                data_shape.append(num_channels)
                mca = np.array(spectrum)
                data = utils.reshape_data(mca, data_shape)
                channels = range(1, num_channels+1)
                ds_name = '_' + key + '_'
                self.write_ds(nxdata, ds_name, data, axes=axes+':'+ds_name+'channel_', units='counts')
                self.write_ds(nxdata, ds_name+'channel_', channels, units='channel')

        return signal, axes
Code example #26
def main():
    with Halo("Setting up script details.", spinner="dots") as spinner:
        league_id = os.environ.get("SLEEPER_LEAGUE_ID", None)
        user_id = os.environ.get("SLEEPER_USER_ID", None)

        args = parser.parse_args()
        command_args = dict(vars(args))
        is_dry_run = command_args.pop("dry_run", None)
        keep_positions = tuple(command_args.pop("positions", None))
        spinner.succeed()

    Halo(f"Included positions are {', '.join(keep_positions)}",
         spinner="dots").succeed()
    league = League(league_id)
    players = Players()

    league_rosters = league.get_rosters()
    if is_dry_run:
        all_players = players.get_all_players()
        with open("./data/sleeper_players_current.json", "w") as outfile:
            json.dump(all_players, outfile)
    else:
        with open("./data/sleeper_players_current.json", "r") as infile:
            all_players = json.load(infile)

    own_team = [
        team for team in league_rosters if team["owner_id"] == user_id
    ].pop()
    own_players = own_team["players"]
    keep_players = {
        p_id: p_data
        for p_id, p_data in all_players.items()
        if p_data["position"] in keep_positions
    }
    # save keep_players for testing
    with open("./data/sleeper_players_keep.json", "w") as outfile:
        json.dump(keep_players, outfile)
    # ID free agents by comparing keep_players to rosters
    rostered_player_ids = [
        player for team in league_rosters for player in team["players"]
    ]
    with Halo("Separating players into rostered and FAs.",
              spinner="dots") as spinner:
        free_agents = {
            p_id: p_data
            for p_id, p_data in keep_players.items()
            if p_id not in rostered_player_ids
        }
        rostered_players = {
            p_id: p_data
            for p_id, p_data in keep_players.items()
            if p_id in rostered_player_ids
        }
        spinner.succeed()

    with Halo("Pulling Numberfire Projections", spinner="dots") as spinner:
        nfp = numberfireProjections("half_ppr")
        nfp.get_data("flex")
        nfp.convert_projections()
        spinner.succeed()

    nf_cleaned_names = {clean_name(x): x for x in nfp.projections.keys()}
    # add projections in to rosters
    for p_id, p_data in free_agents.items():
        if p_data["search_full_name"] in nf_cleaned_names.keys():
            p_data["numberfire_projections"] = nfp.projections[
                nf_cleaned_names[p_data["search_full_name"]]]
        else:
            p_data["numberfire_projections"] = 0

    for p_id, p_data in rostered_players.items():
        if p_data["search_full_name"] in nf_cleaned_names.keys():
            p_data["numberfire_projections"] = nfp.projections[
                nf_cleaned_names[p_data["search_full_name"]]]
        else:
            p_data["numberfire_projections"] = 0
    Halo("Added projections to FAs and rostered players.",
         spinner="dots").succeed()

    # comparison
    own_roster = {
        p_id: p_data
        for p_id, p_data in rostered_players.items() if p_id in own_players
    }
    waiver_players = dict()
    for p_id, p_data in own_roster.items():
        if p_data["status"] == "Injured Reserve":
            continue
        waiver_dict = {
            "drop_proj": p_data["numberfire_projections"],
            "players_to_add": list(),
        }
        for fa_id, fa_data in free_agents.items():
            if (fa_data["numberfire_projections"] >
                    p_data["numberfire_projections"]) and (
                        fa_data["position"] == p_data["position"]):
                fa_dict = {
                    "waiver_player": fa_data["search_full_name"],
                    "waiver_proj": fa_data["numberfire_projections"],
                }
                waiver_dict["players_to_add"].append(fa_dict)
        waiver_players[p_data["search_full_name"]] = waiver_dict
    Halo(
        "Compared FA projections to your roster. Returning players with better projections.",
        spinner="dots",
    ).succeed()

    pp = pprint.PrettyPrinter()
    pp.pprint(waiver_players)
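In example #26, clean_name(x) is matched against Sleeper's search_full_name field, which appears to be a lower-cased player name with spaces and punctuation removed, so the cleaner presumably produces the same shape. A hedged sketch of such a helper (assumed, not this script's actual code):

import re

def clean_name(name):
    # Sketch only: keep nothing but lower-case letters, so that
    # "Patrick Mahomes II" becomes "patrickmahomesii".
    return re.sub(r'[^a-z]', '', name.lower())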
Code example #27
    def mesh(self, nxdata, scan):
        '''*internal*: data parser for 2-D mesh and hklmesh'''
        # TODO: refactor to use NeXus data model: signal, axes, data

        # 2-D parser: http://www.certif.com/spec_help/mesh.html
        # mesh motor1 start1 end1 intervals1 motor2 start2 end2 intervals2 time
        # 2-D parser: http://www.certif.com/spec_help/hklmesh.html
        #  hklmesh Q1 start1 end1 intervals1 Q2 start2 end2 intervals2 time
        # mesh:    data/33id_spec.dat  scan 22
        # hklmesh: data/33bm_spec.dat  scan 17
        signal, axes = '', [
            '',
        ]

        label1, start1, end1, intervals1, label2, start2, end2, intervals2, time = scan.scanCmd.split(
        )[1:]
        if label1 not in scan.data:
            label1 = scan.L[0]  # mnemonic v. name
        if label2 not in scan.data:
            label2 = scan.L[1]  # mnemonic v. name
        axis1 = scan.data.get(label1)
        axis2 = scan.data.get(label2)
        intervals1, intervals2 = map(int, (intervals1, intervals2))
        start1, end1, start2, end2, time = map(
            float, (start1, end1, start2, end2, time))
        if len(axis1) < intervals1:  # stopped scan before second row started
            signal, axes = self.oneD(nxdata, scan)  # fallback support
        else:
            axis1 = axis1[0:intervals1 + 1]
            axis2 = [
                axis2[row] for row in range(len(axis2))
                if row % (intervals1 + 1) == 0
            ]

            column_labels = scan.L
            column_labels.remove(label1)  # special handling
            column_labels.remove(label2)  # special handling
            if scan.scanCmd.startswith('hkl'):
                # find the reciprocal space axis held constant
                label3 = [
                    key for key in ('H', 'K', 'L')
                    if key not in (label1, label2)
                ][0]
                axis3 = scan.data.get(label3)[0]
                self.write_ds(nxdata, label3, axis3)

            self.write_ds(nxdata, label1, axis1)  # 1-D array
            self.write_ds(nxdata, label2, axis2)  # 1-D array

            # build 2-D data objects (do not build label1, label2, [or label3] as 2-D objects)
            data_shape = [len(axis1), len(axis2)]
            for label in column_labels:
                if label not in nxdata:
                    axis = np.array(scan.data.get(label))
                    self.write_ds(nxdata, label,
                                  converters.reshape_data(axis, data_shape))
                else:
                    pass

            signal = utils.clean_name(scan.column_last)
            axes = ':'.join([label1, label2])

        if spec.MCA_DATA_KEY in scan.data:  # 3-D array(s)
            # save each spectrum
            for key, spectrum in sorted(scan.data[spec.MCA_DATA_KEY].items()):
                num_channels = len(spectrum[0])
                data_shape.append(num_channels)
                mca = np.array(spectrum)
                data = converters.reshape_data(mca, data_shape)
                channels = range(1, num_channels + 1)
                ds_name = '_' + key + '_'
                self.write_ds(nxdata,
                              ds_name,
                              data,
                              axes=axes + ':' + ds_name + 'channel_',
                              units='counts')
                self.write_ds(nxdata,
                              ds_name + 'channel_',
                              channels,
                              units='channel')

        return signal, axes
Code example #28
    def write_ds(self, group, label, data, **attr):
        '''*internal*: writes a dataset to the HDF5 file, records the SPEC name as an attribute'''
        clean_name = utils.clean_name(label)
        eznx.write_dataset(group, clean_name, data, spec_name=label, **attr)
Code example #29
def preprocessing(mission='train'):
    # os.makedirs(NEW_DATA_DIR, exist_ok=True)

    # ------------------------------------------
    # process whole_author_profile.json, add index, and save to pickle
    # save format: name2aids --> {name: [aids, ...]}, aid2pids --> {aid: [pid-index, ...]}
    os.makedirs(NEW_DATA_DIR, exist_ok=True)
    os.makedirs(NEW_DATA_V2_DIR, exist_ok=True)
    if mission == 'train':
        whole_author_profile = load_json(
            os.path.join(SPLIT_DIR, 'train_profile-last1year.json'))
    elif mission == 'test':
        whole_author_profile = load_json(WHOLE_AUTHOR_PROFILE_PATH)
    else:
        raise ValueError("check mission value")
    whole_author_profile_pub = load_json(WHOLE_AUTHOR_PROFILE_PUB_PATH)
    name2aids = {}
    aid2pids = {}
    aids = []
    names = []
    pids_with_index = []
    for aid in tqdm.tqdm(whole_author_profile):
        aids.append(aid)
        names.append(whole_author_profile[aid]['name'])
        pids = whole_author_profile[aid]['papers']
        tmp = []
        for paper in pids:
            paper_authors = whole_author_profile_pub[paper]['authors']
            author_names = [clean_name(item['name']) for item in paper_authors]
            # print(author_names)
            index = get_name_index(names[-1], author_names)
            tmp.append('%s-%d' % (paper, index))
        pids_with_index.append(tmp)
    assert len(aids) == len(names)
    assert len(names) == len(pids_with_index)
    print('all aids num: ', len(aids))
    name_set = set(names)
    names_array = np.array(names)
    aids_array = np.array(aids)
    for name in name_set:
        target_aid = aids_array[names_array == name]
        name2aids[name] = target_aid
    for aid, pid in zip(aids, pids_with_index):
        aid2pids[aid] = pid
    if mission == 'train':
        save_pickle(name2aids, os.path.join(NEW_DATA_V2_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        save_pickle(name2aids, os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
        save_pickle(aid2pids, os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))

    # ------------------------------------------
    # save format: aid2year --> {aid: {min: xxx, max: xxx, mean: xxx, median: xxx, min_max_avg: xxx, year_list: [year, ...]}}
    if mission == 'train':
        aid2pids = load_pickle(os.path.join(NEW_DATA_V2_DIR, 'aid2pids.pkl'))
    elif mission == 'test':
        aid2pids = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2pids.pkl'))
    aid2year = {}
    print('Process year info ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        all_years = []
        for pid_with_index in pids:
            pid = pid_with_index.split('-')[0]
            year = whole_author_profile_pub[pid].get('year', '0')
            if year == '':
                year = 0
            else:
                year = int(year)
            if any([year < 1500, year > 2100]):
                year = 0
            all_years.append(year)
        all_years = np.array(all_years)
        all_years = all_years[all_years != 0]
        if len(all_years) == 0:
            year_info = None
        else:
            year_info = {
                'min': np.min(all_years),
                'max': np.max(all_years),
                'mean': np.mean(all_years),
                'min_max_avg': (np.min(all_years) + np.max(all_years)) / 2,
                'median': np.median(all_years),
                'year_list': all_years,
            }
        aid2year[aid] = year_info
    if mission == 'train':
        save_pickle(aid2year, os.path.join(NEW_DATA_V2_DIR, 'aid2year.pkl'))
    elif mission == 'test':
        save_pickle(aid2year, os.path.join(NEW_DATA_DIR, 'aid2year.pkl'))

    # ------------------------------------------
    # save format: aid2coauthor --> {aid: {author-name: count, ...}}
    aid2coauthor = {}
    print('aid2coauthor processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            authors = whole_author_profile_pub[pid]['authors']
            authors_name = [clean_name(item['name']) for item in authors]
            authors_name.pop(int(index))
            for name in authors_name:
                inner_dict[name] += 1
        aid2coauthor[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2coauthor,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2coauthor.pkl'))
    elif mission == 'test':
        save_pickle(aid2coauthor, os.path.join(NEW_DATA_DIR,
                                               'aid2coauthor.pkl'))

    # ------------------------------------------
    # save format: aid2venue --> {aid: {venue-name: count ...}}
    aid2venue = {}
    print('aid2venue processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            venue = whole_author_profile_pub[pid]['venue'].lower()
            if venue != '':
                # aid2venue[aid].add(venue)
                inner_dict[venue] += 1
        aid2venue[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2venue, os.path.join(NEW_DATA_V2_DIR, 'aid2venue.pkl'))
    elif mission == 'test':
        save_pickle(aid2venue, os.path.join(NEW_DATA_DIR, 'aid2venue.pkl'))

    # ------------------------------------------
    # save format: aid2keywords --> {aid: {keyword: count, ...}}
    aid2keywords = {}
    print('aid2keywords processing ...')
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_dict = defaultdict(int)
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            keywords = whole_author_profile_pub[pid].get('keywords', '')
            if len(keywords) == 0:
                continue
            for keyword in keywords:
                if keyword != '':
                    # aid2keywords[aid].add(keyword.lower())
                    inner_dict[keyword] += 1
        aid2keywords[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2keywords,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2keywords.pkl'))
    elif mission == 'test':
        save_pickle(aid2keywords, os.path.join(NEW_DATA_DIR,
                                               'aid2keywords.pkl'))

    # ------------------------------------------
    # save format: aid2orgset--> {aid: set{org_word, org_word, ...}}
    aid2orgset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            author = whole_author_profile_pub[pid].get('authors')[int(index)]
            org = author.get('org', '').lower().strip()
            org_set = set(org.split())
            inner_set = inner_set | org_set
        aid2orgset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2orgset, os.path.join(NEW_DATA_V2_DIR,
                                             'aid2orgset.pkl'))
    elif mission == 'test':
        save_pickle(aid2orgset, os.path.join(NEW_DATA_DIR, 'aid2orgset.pkl'))

    # ------------------------------------------
    # save format: aid2venueset--> {aid: set{venue_word, venue_word, ...}}
    aid2venueset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            venue = whole_author_profile_pub[pid].get('venue', '').lower()
            if venue == '':
                continue
            else:
                venue_set = set(venue.replace('-', ' ').split())
                inner_set = inner_set | venue_set
        aid2venueset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2venueset,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2venueset.pkl'))
    elif mission == 'test':
        save_pickle(aid2venueset, os.path.join(NEW_DATA_DIR,
                                               'aid2venueset.pkl'))

    # ------------------------------------------
    # save format: aid2keywordsset--> {aid: set{key_word, key_word, ...}}
    aid2keywordsset = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_set = set()
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            keywords = whole_author_profile_pub[pid].get('keywords', '')
            if len(keywords) == 0:
                continue
            for keyword in keywords:
                if keyword != '':
                    keyword_set = set(keyword.lower().replace('-',
                                                              ' ').split())
                    inner_set = inner_set | keyword_set
        aid2keywordsset[aid] = inner_set
    if mission == 'train':
        save_pickle(aid2keywordsset,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2keywordsset.pkl'))
    elif mission == 'test':
        save_pickle(aid2keywordsset,
                    os.path.join(NEW_DATA_DIR, 'aid2keywordsset.pkl'))

    # ------------------------------------------
    # save format: aid2orgwithyear --> {aid: [(org, year), () ...]}
    aid2orgwithyear = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        pids = aid2pids[aid]
        inner_list = []
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            authors = whole_author_profile_pub[pid]['authors']
            org = authors[int(index)].get('org', '').lower()
            year = whole_author_profile_pub[pid].get('year', '0')
            if year == '':
                year = 0
            else:
                year = int(year)
            if any([year < 1500, year > 2100]):
                year = 0
            inner_list.append((org, year))
        aid2orgwithyear[aid] = inner_list
    if mission == 'train':
        save_pickle(aid2orgwithyear,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2orgwithyear.pkl'))
    elif mission == 'test':
        save_pickle(aid2orgwithyear,
                    os.path.join(NEW_DATA_DIR, 'aid2orgwithyear.pkl'))

    # ------------------------------------------
    # save format aid2yearinfo --> {aid: {year: {
    #                                            orgs: [org, ....],
    #                                            venues: [venues, ...],
    #                                            keywords: [keyword, ...],
    #                                            coauthors: [author-name, ...],
    #                                            }}}
    aid2yearinfo = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        inner_dict = {}
        pids = aid2pids[aid]
        for pid_with_index in pids:
            pid, index = pid_with_index.split('-')
            year = whole_author_profile_pub[pid].get('year', '0')
            if year == '':
                year = 0
            else:
                year = int(year)
            if any([year < 1500, year > 2100]):
                year = 0
            authors = whole_author_profile_pub[pid]['authors']
            authors_name = [clean_name(item['name']) for item in authors]
            org = [authors[int(index)].get('org', '').lower()]
            authors_name.pop(int(index))
            coauthor = authors_name
            venue = [whole_author_profile_pub[pid].get('venue', '').lower()]
            keywords = whole_author_profile_pub[pid].get('keywords', [''])
            if len(keywords) == 0:
                keywords = ['']
            keywords = [keyword.lower() for keyword in keywords]
            tmp_dict = {
                'orgs': org,
                'venues': venue,
                'keywords': keywords,
                'coauthors': coauthor,
            }
            if year in inner_dict.keys():
                for key in tmp_dict:
                    inner_dict[year][key].extend(tmp_dict[key])
            else:
                inner_dict[year] = tmp_dict
        aid2yearinfo[aid] = inner_dict
    if mission == 'train':
        save_pickle(aid2yearinfo,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2yearinfo.pkl'))
    elif mission == 'test':
        save_pickle(aid2yearinfo, os.path.join(NEW_DATA_DIR,
                                               'aid2yearinfo.pkl'))

    texttovec = TextToVec()
    # ------------------------------------------
    # save format: aid2titlevec --> {aid: [mean value]}
    aid2titlevec = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        papers = aid2pids[aid]
        inner_list = []
        for pid_with_index in papers:
            pid, index = pid_with_index.split('-')
            title = whole_author_profile_pub[pid]['title']
            inner_list.append(texttovec.get_vec(title))
        if len(inner_list) == 0:
            aid2titlevec[aid] = np.zeros(300)
        else:
            aid2titlevec[aid] = np.mean(np.array(inner_list), axis=0)
    if mission == 'train':
        save_pickle(aid2titlevec,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2titlevec.pkl'))
    elif mission == 'test':
        save_pickle(aid2titlevec, os.path.join(NEW_DATA_DIR,
                                               'aid2titlevec.pkl'))

    # ------------------------------------------
    # save format: aid2abstractvec --> {aid: [mean value]}
    aid2abstractvec = {}
    for aid in tqdm.tqdm(aid2pids.keys()):
        papers = aid2pids[aid]
        inner_list = []
        for pid_with_index in papers:
            pid, index = pid_with_index.split('-')
            abstract = whole_author_profile_pub[pid].get('abstract')
            if abstract is None:
                continue
            inner_list.append(texttovec.get_vec(abstract))
        if len(inner_list) == 0:
            aid2abstractvec[aid] = np.zeros(300)
        else:
            aid2abstractvec[aid] = np.mean(np.array(inner_list), axis=0)
    if mission == 'train':
        save_pickle(aid2abstractvec,
                    os.path.join(NEW_DATA_V2_DIR, 'aid2abstractvec.pkl'))
    elif mission == 'test':
        save_pickle(aid2abstractvec,
                    os.path.join(NEW_DATA_DIR, 'aid2abstractvec.pkl'))
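A hypothetical sanity check on the artifacts written above (the author id is a placeholder; assumes the 'train' branch wrote to NEW_DATA_V2_DIR and that load_pickle is the counterpart of the save_pickle helper used throughout these examples):

aid2titlevec = load_pickle(os.path.join(NEW_DATA_V2_DIR, 'aid2titlevec.pkl'))
print(aid2titlevec['a0001'].shape)  # (300,): the mean TextToVec embedding over that author's paper titles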
コード例 #30
0
def get_pid_with_index(whole_author_profile_pub, pid, name):
    authors = whole_author_profile_pub[pid]['authors']
    authors_names = [clean_name(item['name']) for item in authors]
    index = get_name_index(name, authors_names)
    return '%s-%d' % (pid, index)
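A minimal usage sketch; the paper id and author name are placeholders, and whole_author_profile_pub is assumed to be the loaded publication JSON used throughout these examples:

pid_with_index = get_pid_with_index(whole_author_profile_pub, 'paper0001', 'jing_zhang')
# e.g. 'paper0001-2' when the cleaned name matches the third listed author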
コード例 #31
0
def get_all_trades(event, context):

    STABLE_COINS = ['USDT', 'USDC', 'PAX', 'BUSD', 'TUSD']
    # Retrieve traded pairs from the event payload
    json_payload = json.loads(event['body'])['data']
    binance_api_key = json_payload['API_KEY']
    binance_api_secret = json_payload['API_SECRET']
    traded_pairs = json_payload['TRADED_PAIRS']

    binance_client = Client(api_key=binance_api_key,
                            api_secret=binance_api_secret)

    # Get latest prices of each coin from s3 bucket
    bucket_name = environ.get("BUCKET")
    coin_file_names = environ.get("COIN_FILE_NAMES")
    client = boto3.client("s3")
    file = client.get_object(Bucket=bucket_name, Key=coin_file_names)
    coins_df = pd.read_csv(file["Body"], compression='gzip')
    prices = coins_df.set_index('symbol').to_dict()['price']

    # Step 1: Get quantities of the coins
    order_history = []
    reverse_order = []  # Suppose you trade DOTBNB, create a reverse order for BNB to "sell"
    qty_dict = {}
    qty_dict['BTC'] = 0
    qty_dict['ETH'] = 0

    for idx, pair in enumerate(traded_pairs):
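        # pause once partway through the pair list to stay under the Binance API rate limit (assumed intent of the sleep)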
        if idx == 200:
            sleep(30)
        orders = binance_client.get_all_orders(symbol=pair)
        order_history.extend(orders)
        quantity = 0
        for order in orders:
            ex_qty = float(order['executedQty'])
            if order['side'] == 'BUY':
                quantity += ex_qty
            else:
                quantity -= ex_qty

            if ex_qty > 0:
                if pair.split('BTC')[-1] == "":
                    reverse_coin = 'BTC'
                elif pair.split('BNB')[-1] == "":
                    reverse_coin = 'BNB'
                elif pair.split('ETH')[-1] == "":
                    reverse_coin = 'ETH'
                else:
                    reverse_coin = False

                if reverse_coin:
                    order_copy = order.copy()
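                    # shift the copied timestamps by 1 ms so the synthetic order stays distinct from the original (assumed intent)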
                    order_copy['time'] += 1
                    order_copy['updateTime'] += 1
                    order_copy['symbol'] = reverse_coin + "USDT"
                    order_copy['executedQty'] = float(
                        order_copy['price']) * float(order_copy['executedQty'])
                    order_copy['side'] = "SELL"
                    # This is for computing coin profit and loss
                    order_copy['type'] = "REVERSE"
                    reverse_order.append(order_copy)

        if orders:
            clean_coin_name = clean_name(pair)
            if clean_coin_name in qty_dict:
                qty_dict[clean_coin_name] += quantity
            else:
                qty_dict[clean_coin_name] = quantity

    # Step 1 Part 2: Resolve the reverse orders - this is very ugly code, but it works for now
    order_history.extend(reverse_order)
    for order in reverse_order:
        ex_qty = float(order['executedQty'])
        pair = order['symbol']
        clean_coin_name = clean_name(pair)

        if order['side'] == 'BUY':
            qty_dict[clean_coin_name] += ex_qty
        else:
            qty_dict[clean_coin_name] -= ex_qty

    # Step 2: Get current prices of coins in possession
    price_dict = {}
    for clean_coin_name in list(qty_dict.keys()):
        price_dict[clean_coin_name] = prices[clean_coin_name + "USDT"]

    # Step 3: Construct assets dictionary
    assets = {}
    for k in qty_dict:
        assets[k] = [qty_dict[k], price_dict[k]]

    # Step 4: Get any stable coins position
    for stable_coin in STABLE_COINS:
        stable_coin_obj = binance_client.get_asset_balance(stable_coin)
        stable_coin_position = float(stable_coin_obj['free']) + float(
            stable_coin_obj['locked'])
        assets["USDT"] = [stable_coin_position, 1]

    frontend_formatted_assets = []
    for coin in assets:
        temp = {}
        temp['Coin'] = coin
        temp['Quantity'] = assets[coin][0]
        temp['Price'] = assets[coin][1]
        temp['Value'] = round(
            float(temp['Price']) * float(temp['Quantity']), 3)
        if temp['Value'] > 5.0:
            frontend_formatted_assets.append(temp)

    response = {
        "statusCode": "200",
        "headers": {
            'Access-Control-Allow-Origin': "*",
            'Access-Control-Allow-Credentials': True,
        },
        "body": json.dumps({
            "assets": frontend_formatted_assets,
            "order_history": order_history,
        }),
    }

    return response
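A hedged invocation sketch; the event payload, credentials, and traded pairs are placeholders, and the BUCKET / COIN_FILE_NAMES environment variables are assumed to point at the gzipped price CSV read above:

import json

event = {
    'body': json.dumps({
        'data': {
            'API_KEY': 'placeholder-key',
            'API_SECRET': 'placeholder-secret',
            'TRADED_PAIRS': ['ETHBTC', 'DOTBNB'],
        }
    })
}
response = get_all_trades(event, context=None)
assets = json.loads(response['body'])['assets']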
コード例 #32
0
def gen_title_feature():
    aid2titlevec = load_pickle(os.path.join(NEW_DATA_DIR, 'aid2titlevec.pkl'))

    test_unass = load_json(TEST_UNASS_PATH)
    test_pub = load_json(TEST_PUB_PATH)
    name2aids = load_pickle(os.path.join(NEW_DATA_DIR, 'name2aids.pkl'))
    texttovec = TextToVec()

    name_map = load_json(
        os.path.join(FINAL_DIR, 'name.different.modified.json'))
    original_name = [pair[0] for pair in name_map]
    changed_name = [pair[1] for pair in name_map]
    name_map2 = load_json(
        os.path.join(FINAL_DIR, 'name.different.2.modified.json'))
    original_name2 = [pair[0] for pair in name_map2]
    changed_name2 = [pair[1] for pair in name_map2]

    all_authors_name = list(name2aids.keys())
    # test_title_feature --> {pid-with-index: {candidate-aids: [...], data: [(emb0, emb1), ...]}}
    test_title_feature = {}
    print('Gen title emb pair ...')
    for pid_with_index in tqdm.tqdm(test_unass):
        inner_dict = {}
        now_pid, index = pid_with_index.split('-')
        author_name = test_pub[now_pid]['authors'][int(index)]['name']
        author_name = clean_name(author_name)
        if pid_with_index == 'ToCcabLT-1':
            author_name = 'junliang_wang'
        if pid_with_index == 'cVvvcFzj-1':
            author_name = 'xiaojun_liu'

        if author_name in original_name2:
            name_index = original_name2.index(author_name)
            author_name = changed_name2[name_index]
        elif author_name in original_name:
            name_index = original_name.index(author_name)
            author_name = changed_name[name_index]
        else:
            index = get_name_index(author_name, all_authors_name)
            author_name = all_authors_name[index]

        if isinstance(author_name, str):
            candidate_aids = name2aids[author_name]
        elif isinstance(author_name, list):
            candidate_aids = []
            for name in author_name:
                candidate_aids.extend(name2aids[name].tolist())
            candidate_aids = np.array(candidate_aids)
        else:
            raise ValueError("check author name !!!")

        inner_dict['candidate-aids'] = candidate_aids
        info = test_pub[now_pid].get('title')
        if info is None:
            emb = np.zeros(300)
        else:
            emb = texttovec.get_vec(info)
        data = []
        for aid in candidate_aids:
            emb_pair = (aid2titlevec[aid], emb)
            data.append(emb_pair)
        inner_dict['data'] = data
        test_title_feature[pid_with_index] = inner_dict
    save_pickle(
        test_title_feature,
        os.path.join(TEST_FEATURE_DIR, 'test-title-emb-pair-name-clean-2.pkl'))

    print('Gen title distance ...')
    test_title_emb_pair = load_pickle(
        os.path.join(TEST_FEATURE_DIR, 'test-title-emb-pair-name-clean-2.pkl'))
    test_unass = load_json(TEST_UNASS_PATH)
    title_emb_pair = []
    for pid_with_index in tqdm.tqdm(test_unass):
        for pair in test_title_emb_pair[pid_with_index]['data']:
            title_emb_pair.append(pair)
    emb_pair_to_distance(
        'tm.title.1.checkpoint.pth', 'title', title_emb_pair,
        os.path.join(TEST_FEATURE_DIR,
                     'test-title-distance-df-name-clean-2.pkl'))
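For intuition only, a plain cosine-similarity pass over the same (author-profile, paper) title-embedding pairs; this is not the project's emb_pair_to_distance, which scores the pairs with the learned tm.title.1.checkpoint.pth model:

import numpy as np

def title_cosine(author_emb, paper_emb, eps=1e-8):
    # cosine similarity between an author's mean title embedding and an unassigned paper's title embedding
    return float(np.dot(author_emb, paper_emb) /
                 (np.linalg.norm(author_emb) * np.linalg.norm(paper_emb) + eps))

scores = [title_cosine(a, b) for a, b in title_emb_pair]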
コード例 #33
0
    def retrieve_plot_data(self):
        '''retrieve default data from spec data file'''
        '''
        data parser for 2-D mesh and hklmesh
        '''
        label1, start1, end1, intervals1, label2, start2, end2, intervals2, time = self.scan.scanCmd.split(
        )[1:]
        if label1 not in self.scan.data:
            label1 = self.scan.L[0]  # mnemonic v. name
        if label2 not in self.scan.data:
            label2 = self.scan.L[1]  # mnemonic v. name
        axis1 = self.scan.data.get(label1)
        axis2 = self.scan.data.get(label2)
        intervals1, intervals2 = map(int, (intervals1, intervals2))
        start1, end1, start2, end2, time = map(
            float, (start1, end1, start2, end2, time))

        if len(axis1) < intervals1 and min(axis2) == max(axis2):
            # stopped scan before second row started, 1-D plot is better (issue #82)
            self.axes = [
                label1,
            ]
            self.signal = self.scan.column_last
            self.data[label1] = self.scan.data[label1]
            self.data[self.signal] = self.scan.data[self.signal]
            return

        axis1 = axis1[0:intervals1 + 1]
        self.data[label1] = axis1  # 1-D array

        axis2 = [
            axis2[row] for row in range(len(axis2))
            if row % (intervals1 + 1) == 0
        ]
        self.data[label2] = axis2  # 1-D array

        column_labels = self.scan.L
        column_labels.remove(label1)  # special handling
        column_labels.remove(label2)  # special handling
        if self.scan.scanCmd.startswith('hkl'):
            # find the reciprocal space axis held constant
            label3 = [key for key in ('H', 'K', 'L')
                      if key in column_labels][0]
            self.data[label3] = self.scan.data.get(label3)[0]  # constant

        # build 2-D data objects (do not build label1, label2, [or label3] as 2-D objects)
        data_shape = [len(axis2), len(axis1)]
        for label in column_labels:
            if label not in self.data:
                axis = numpy.array(self.scan.data.get(label))
                self.data[label] = utils.reshape_data(axis, data_shape)
            else:
                pass

        self.signal = utils.clean_name(self.scan.column_last)
        self.axes = [label1, label2]

        if spec.MCA_DATA_KEY in self.scan.data:  # 3-D array(s)
            # save each spectrum
            for key, spectrum in sorted(
                    self.scan.data[spec.MCA_DATA_KEY].items()):
                num_channels = len(spectrum[0])
                data_shape.append(num_channels)
                mca = numpy.array(spectrum)
                data = utils.reshape_data(mca, data_shape)
                channels = range(1, num_channels + 1)
                ds_name = '_' + key + '_'
                self.data[ds_name] = data
                self.data[ds_name + 'channel_'] = channels
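A minimal numpy illustration of the mesh shape the code above builds (this is not spec2nexus's own utils.reshape_data, only the row-major reshape it is expected to produce for a fully recorded scan):

import numpy

intervals1, intervals2 = 4, 2  # 5 points on the fast axis, 3 rows on the slow axis
flat = numpy.arange((intervals1 + 1) * (intervals2 + 1))
image = flat.reshape((intervals2 + 1, intervals1 + 1))  # data_shape = [len(axis2), len(axis1)]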