Example #1
    def create_h5(self, split_num, iter_num, input_sequence, DEL_TRAIN_WAV):

        cpu_cores = int(split_num / iter_num)
        tmp1 = []
        tmp2 = []
        tmp3 = []
        # noisy_dir = join(noisy_dir, 'train')
        training_data_list = search_wav(self.noisy_dir)
        print('Total training files: ', len(training_data_list))

        for file in training_data_list:
            try:
                snr, noise_name, clean_name1, clean_name2 = file.split(
                    '/')[-1].split('_')
                clean_file = join(
                    self.noisy_dir,
                    '_'.join(['0ms', 'n0', clean_name1, clean_name2]))
                noisy_file = file
            except ValueError:
                snr, noise_name, clean_name = file.split('/')[-1].split('_')
                clean_file = join(self.noisy_dir,
                                  '_'.join(['0ms', 'n0', clean_name]))
                noisy_file = file

            tmp1.append(clean_file)
            tmp2.append(noisy_file)

        training_num = 30000
        t1, t2 = shuffle(np.array(tmp1), np.array(tmp2))
        t1 = t1[:training_num]
        t2 = t2[:training_num]

        clean_split_list = split_list(t1, wanted_parts=split_num)
        noisy_split_list = split_list(t2, wanted_parts=split_num)

        start = 0
        end = cpu_cores
        for num in range(iter_num):
            print(start, end)
            pool = Pool(cpu_cores)
            func = partial(_create_split_h5, clean_split_list,
                           noisy_split_list, self.save_h5_dir,
                           self.save_h5_name, input_sequence)
            pool.map(func, range(start, end))
            pool.close()
            pool.join()
            start = end
            end += cpu_cores
        if DEL_TRAIN_WAV:
            shutil.rmtree(self.noisy_dir)
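The split_list helper these examples rely on is not shown on this page. A minimal sketch of a wanted_parts-style version, consistent with the split_list(t1, wanted_parts=split_num) call above but only an assumption about the project's actual helper:

def split_list(alist, wanted_parts=1):
    # Split alist into wanted_parts roughly equal consecutive slices (assumed behavior).
    length = len(alist)
    return [alist[i * length // wanted_parts:(i + 1) * length // wanted_parts]
            for i in range(wanted_parts)]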
Example #2
def remove_list(update: Update, context: CallbackContext):
    lists = db.get_lists()
    buttons = [l.get("name") for l in lists]
    reply_markup = ReplyKeyboardMarkup(split_list(buttons, 3))
    update.message.reply_text("Choose list:", reply_markup=reply_markup)

    return conversations["remove_list"]["choose_list"]
def improved_create_users_from_ids(user_ids):
    """
    create_users_from_idsの改善版
    時間が100分の1になったやつ(プロフィールのクエリだけまとめてIDを飛ばせることに気づいた)
    """
    users = []
    user_ids_list = utils.split_list(user_ids, 100)
    for index, ids in enumerate(user_ids_list):
        utils.print_step_log("CreateUsersList", index, len(user_ids_list))
        try:
            profs = tg.get_user_profiles(ids)
        except:
            traceback.print_exc()
            sleep(1)
            continue

        if not profs:
            continue

        for prof in profs:
            user = User(id=prof['id'],
                        name=prof['name'],
                        description=prof['description'],
                        friends_count=prof['friends_count'],
                        created_at=dt.strptime(prof['created_at'],
                                               "%a %b %d %H:%M:%S +0000 %Y"),
                        is_protected=prof['protected'])
            users.append(user)
        sleep(1)

    return users
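Here utils.split_list(user_ids, 100) is used as a fixed-chunk-size splitter so that at most 100 IDs go into each profile request. A minimal sketch under that assumption (not the project's actual utils module):

def split_list(items, chunk_size):
    # Break items into consecutive chunks of at most chunk_size elements (assumed behavior).
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]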
Example #4
    async def members(self, ctx: utils.CustomContext, *, role: Role):
        """Check the list of members in a certain role.
        Permissions needed: `Manage Messages`"""

        in_role = [f"{member.mention} ({member})" for member in role.members]
        columns = [in_role, ["\u200b"]]
        if len(in_role) > 1:
            columns[0], columns[1] = utils.split_list(in_role)
            columns.sort(reverse=True)

        if len("\n".join(columns[0])) > 1024:
            columns[0] = columns[0][:20]

        if len("\n".join(columns[1])) > 1024:
            columns[1] = columns[1][:20]

        embed = self.bot.embed(
            ctx,
            title=f"Members in {role.name} [{sum(1 for m in role.members)}]")
        for column in columns:
            embed.add_field(name="\u200b",
                            value="\n".join(column) if column else "\u200b")

        await ctx.send(embed=embed)
def replace_boilerplate(message, related, region_name):
    proposals, addresses = split_list(lambda x: isinstance(x, Proposal),
                                      related)

    return template_replace(message, {
        "region": region_name,
        "proposals": "\n<br/>".join(f"{p.address} ({p.case_number})"
                                    for p in proposals),
        "addresses": "\n<br/>".join(addr[0] for addr in addresses)
    }).replace("\n", "\n<br/>")
 def __init__(self,
              formula_path,
              dict_path,
              separate_conj_stmt=False,
              binary=False,
              part_no=-1,
              part_total=0,
              file_list=None,
              deepmath=False,
              norename=False,
              filter_abelian=False,
              compatible=False):  # part_no, part_total: will not shuffle.
     self.formula_path = formula_path
     self.dict_path = dict_path
     self.maxsize = 500  # maxsize for async queue
     self.iter_ = 0  # epoch. Legacy reason for its name
     self.total_in_epoch = -1  # conj, stmt pairs supply in current epoch.
     self.total_iter = -1  # total iteration
     self.rename = not norename
     if not os.path.exists(dict_path):
         self.dict = self.build_dictionary()
     else:
         self.dict = torch.load(dict_path)
     self.queue = Queue(self.maxsize)
     self.reader = Process(target=self.read)
     self.dict_size = len(self.dict.keys())
     self.separate_conj_stmt = separate_conj_stmt
     self.binary = binary
     self.part_no = part_no
     self.part_total = part_total
     if file_list is None:
         file_list = os.listdir(self.formula_path)
         if part_total != 0:
             file_list.sort()
             file_list = split_list(file_list, part_total, part_no)
     else:
         if part_total != 0:
             file_list = split_list(file_list, part_total, part_no)
     self.file_list = file_list
     self.deepmath = deepmath
     self.filter_abelian = filter_abelian
     self.compatible = compatible
Example #7
def parse_cp_mv(args):
  """
    parse args like:
       4 5 6 7 to asd bsd csd
    return [[tagsd,asdf,vvv,sdf],[4,5,6,7]]
  """
  # here we split two arrays by <to> keyword
  args = u.sp_split(args)
  
  nodes,tags = u.split_list(args, 'to')
  return [nodes,u.everything_to_str(tags)]
    def _make_registers(self, n_chunks=7):
        self.lowest_note = self.range[0]
        self.highest_note = self.range[-1]

        registers = list(utils.split_list(self.range, n_chunks=n_chunks))

        self.middle_register = registers[3]  # assuming 7 divisions
        self.highest_register = registers[-1]
        self.lowest_register = registers[0]
        self.safe_register = utils.flatten(registers[1:-1])
        self.very_safe_register = utils.flatten(registers[2:-2])
Example #9
def split_bycol(mat, rate=0.7):

    row, col = mat.shape

    col_list = sum(mat)

    total = sum(col_list)
    threshold = total * rate / col

    col_list /= threshold
    _splits = split_list(col_list, 0)
    res_list = filter(lambda res: res[1] != [], _splits)
    return res_list
Example #10
def parallel_global_stiffness(mesh) -> GlobalStiffnessLil:
    N = len(mesh.nodes)
    shape = (2 * N, 2 * N)
    with Pool(N_PROCESSES) as pool:
        all_args = zip(split_list(list(mesh.elements.values()), N_PROCESSES),
                       repeat(shape))
        results = pool.starmap(GlobalStiffnessCoo.from_elements, all_args)
    K = sparse.csr_matrix(shape)
    for result in results:
        K += result
    K = K.tolil()
    #    print('conversion done')
    return K
Example #11
def load_frame_datas(data_path):
    data_path_list = glob.glob("%s/data_json/frame_*.json" % data_path)
    frame_data_list = []
    number_of_peoples_in_frames = []
    for i in range(len(data_path_list)):
        file_path = join(data_path, 'data_json', 'frame_%d.json' % i)
        with open(file_path, 'r') as f:
            pose_result = json.loads(f.read())
            peoples = [utils.split_list(people['pose_keypoints'], 3) for people in pose_result['people']]
            number_of_peoples_in_frames.append(len(peoples))
            frame_data_list.append(peoples)

    return frame_data_list, number_of_peoples_in_frames
Example #13
    def parse(full_input: List[Word]) -> Optional[ParseResult]:
        inputs = split_list(full_input, separators)

        if len(inputs) <= 1:
            # There were no occurrences of the separators.
            return FailureParse()

        results = [single_action().parse(words) for words in inputs]
        filtered = [result for result in results
                    if result.is_success()]  # Ignore partials
        actions = [r.parsed for r in filtered]

        return SuccessParse(Composite(actions), 1.0, [])
Example #14
async def index(request):

    points = Point.select()

    init_point = points[0] if points else default_point_factory()

    lines = [line for line in split_list(points, lambda p: p.is_newline)]

    return {
        "points": points,
        "token": yandex,
        "init_point": init_point,
        "lines": lines
    }
def multi_process_generate(load_dir, save_dir, savename):
    """"
    Extracts datapoints from all .json files in train_dir and saves the them in a new .csv file
    :param load_dir: The directory to load from
    :param save_dir: The directory to save the extracted headers
    :param savename: The filename to save
    :param num_headers: The amount of headers to use as datapoint
    """

    csvfiles = []

    for csv in glob.iglob(save_dir + '*.csv'):
        csvfiles.append(os.path.basename(csv))
    # Load all files
    files = []
    for fullname in glob.iglob(load_dir + '*.json'):
        filename = os.path.basename(fullname)
        csvname = filename.split('.')[0] + '.csv'
        if csvname in csvfiles:
            os.rename(fullname, load_dir + 'processed_json/' + filename)
        else:
            files.append(fullname)

    manager = multiprocessing.Manager()
    dataframes = manager.list()
    filelist = glob.glob(load_dir + '*.json')
    splits_count = multiprocessing.cpu_count()
    filesplits = split_list(filelist, splits_count)

    threads = []
    for split in filesplits:
        # create a thread for each
        t = multiprocessing.Process(target=save_parse_result,
                                    args=(split, dataframes))
        threads.append(t)
        t.start()
        print(t.name + ' starting')
    # create one large dataframe

    for t in threads:
        t.join()
        print("Process joined: ", t)

    data = pd.concat(dataframes)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    data.to_csv(save_dir + savename, mode='w')
Example #17
    def parallel_calculate_array_values(self, U):
        for i in range(len(self.nodes)):
            self.nodes[i].values['displacement'] = np.array(
                [U[2 * i], U[2 * i + 1]])

        with Pool(N_PROCESSES) as pool:
            element_groups = split_list(list(self.elements.values()),
                                        N_PROCESSES)
            strain_and_stress_arrays = pool.map(get_arrays, element_groups)

            for i, (strain_array,
                    stress_array) in enumerate(strain_and_stress_arrays):
                group = element_groups[i]
                for j, element in enumerate(group):
                    element.values['strain'] = strain_array[j]
                    element.values['stress'] = stress_array[j]
    def _custom_kcrossvalidation(self, x, y):
        '''
        Function creates cross-validation folds for x and y inputs.
        :param x: 3d input array: ntrials x ntimepoints x nfeatures
        :param y: vector of labels: ntrials,
        :param k: number of splits of the data
        :param n_val: number of trials in validation set
        :return: Train, Val and Test lists of nfolds length, each item is a tuple of x and y data
        '''

        n = x.shape[0]
        shape = x.shape[1:]
        x = x.reshape(n, -1)

        if self.shuffle: x, y = self._shuffle_data(x, y)

        l = range(n)
        test_folds = utils.split_list(l, self.k)
        Train, Val, Test = [], [], []

        for i, t in enumerate(test_folds):
            xc = x.copy()
            yc = y.copy()
            Test.append([
                xc[t].reshape([-1] + [shape[s] for s in range(len(shape))]),
                y[t]
            ])

            xc = np.delete(xc, t, axis=0)
            yc = np.delete(yc, t, axis=0)

            xc = np.roll(xc, -len(t) * i, axis=0)
            yc = np.roll(yc, -len(t) * i, axis=0)

            Val.append([
                xc[:len(t)].reshape([-1] +
                                    [shape[s] for s in range(len(shape))]),
                yc[:len(t)]
            ])
            Train.append([
                xc[len(t):].reshape([-1] +
                                    [shape[s] for s in range(len(shape))]),
                yc[len(t):]
            ])

        return Train, Test, Val
Example #19
def process_resources(threads):
    """Download all the unprocessed resources
    """
    class Downloader(Thread):
        def __init__(self, resources):
            self.resources = resources
            super(Downloader, self).__init__()

        def run(self):
            boilerpipe.jpype.attachThreadToJVM()
            for res in self.resources:
                try:
                    content = download(res.url)
                    content = boilerpipe.transform(content)
                except:
                    content = ""

                if content and len(content) >= 200:
                    res.textual = True

                # we don't want documents with no usable content
                if not content:
                    res.blacklisted = True
                    print "blacklisted %s" % res.url
                else:
                    res.content = content
                    print "downloaded %s" % res.url
                res.processed = True
                res.save()

    if threads > multiprocessing.cpu_count():
        threads = multiprocessing.cpu_count()

    # initialise the JVM
    boilerpipe.start_jvm()
    resources = list(db.resources.Resource.find({'processed': False}))

    print "download %s urls using %s threads" % (len(resources), threads)

    # split the resource into the number of threads
    resources = split_list(resources, threads)

    # start the threads and pass them the resources to be processed
    for i in range(threads):
        d = Downloader(resources[i])
        d.start()
Example #21
    def list_scrapers(self, update, context):
        """
            Lista os scrapers e abre um teclado interativo
        Args:
            update (Update): Objeto com os dados do chat e do usuário.
            context (CallbackContext): Objeto de contexto.
        """
        keyboard = [
            InlineKeyboardButton(
                text=scraper_name,
                callback_data=f"\\scraper_selected:{scraper_name}",
            ) for scraper_name in self.scrapers
        ]

        keyboard_splitted = utils.split_list(input_list=keyboard, size=2)

        reply_markup = InlineKeyboardMarkup(inline_keyboard=keyboard_splitted)
        update.message.reply_text("Concursos cadastrados no bot:",
                                  reply_markup=reply_markup)
def process_pcap_to_h5(read_dir, save_dir, session_threshold=5000):
    """
    Use this method to process all pcap files in a directory to a h5 format.
    Session threshold is used to filter out all sessions containing fewer packets
    :param save_dir:
    :param read_dir: Directory containing pcap files that should be converted into h5 format
    :param session_threshold: Threshold to filter out session with less packets
    :return: None
    """
    h5files = []

    for h5 in glob.iglob(save_dir + '*.h5'):
        h5files.append(os.path.basename(h5))
    # Load all files
    files = []
    for fullname in glob.iglob(read_dir + '*.pcap'):
        filename = os.path.basename(fullname)
        h5name = filename +'.h5'
        if h5name in h5files:
            os.rename(fullname, read_dir + '/processed_pcap/' + filename)
        else:
            files.append(fullname)

    splits = 4
    files_splits = split_list(files, splits)
    processes = []
    for file_split in files_splits:
        # create a thread for each
        t1 = multiprocessing.Process(target=save_pcap_task, args=(file_split, save_dir, session_threshold))
        print("Starting process", t1)
        processes.append(t1)
        t1.start()

    for process in processes:
        process.join()
        print("Process joined", process)
def answer_xval_lr(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. ???
        4. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    
    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))
 
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        
        print ("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                open('../data/xval_feat_strings.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

    elif args.load: # load pre-computed feature strings
        print ("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
                pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))
    
    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    #X_scaler = sklearn.preprocessing.StandardScaler(with_mean=False, with_std=True)
    #X = X_scaler.fit_transform(X)

    # try some LDA stuff
    print ("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx+testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    #topics_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    #topics = topics_scaler.fit_transform(topics)

    # compute similarity for each question and each answer (of 4)
    # use this as X (e.g. NLP similarity, LDA similarity)
    # binary classification with LR (i.e. is the answer right or not)
    
    print ("Evaluating train data:")
    X_lr_train, y_lr_train = compute_scores(trainx, X, fv,\
            scorer=similarity.Scorer.cosine, topics=topics, train=True,\
            print_info=True)
    print ("Training LR")
    # standardizing
    lr_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_lr_train = lr_scaler.fit_transform(X_lr_train)

    # alpha sets the weight on regularization term
    lr = sklearn.linear_model.SGDClassifier(loss='log', penalty='l2',\
            n_iter=100, shuffle=True, fit_intercept=True, class_weight={0:.1, 1:.9})
    lr.fit(X_lr_train, y_lr_train)
    #lr.coef_[0,0] = 0.75
    #lr.coef_[0,1] = 0.25
    #lr.intercept_[0] = 0.0
    print (lr.coef_)
    print (lr.intercept_)
    our_answers = lr_make_predictions(X_lr_train, lr)
    acc_trainx = compute_accuracy(trainx, our_answers)
    print ("Train accuracy = {}\n".format(acc_trainx))

    print ("Evaluating test data:")
    X_lr_test = compute_scores(testx, X, fv,\
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    X_lr_test = lr_scaler.transform(X_lr_test)
    our_answers = lr_make_predictions(X_lr_test, lr)
    acc_testx = compute_accuracy(testx, our_answers)
    print ("Test accuracy = {}\n".format(acc_testx))
Example #24
def ipheadertask(filelist):
    j = 1
    for fullname in filelist:
        print("Loading filenr: {}".format(j))
        load_dir, filename = os.path.split(fullname)
        df = utils.load_h5(load_dir, filename)
        frames = df['bytes'].values
        for i, frame in enumerate(frames):
            p = np.fromstring(frame, dtype=np.uint8)
            if p[14] != 69:
                print("IP Header length not 20! in file {0}".format(filename))
        j += 1


if __name__ == '__main__':
    filelist = glob.glob(load_dir + '*.h5')
    filesplits = utils.split_list(filelist, 4)

    threads = []
    for split in filesplits:
        # create a thread for each
        t = multiprocessing.Process(target=ipheadertask, args=(split, ))
        threads.append(t)
        t.start()
    # create one large dataframe

    for t in threads:
        t.join()
        print("Process joined: ", t)
Example #25
from utils import csv2list, split_list, list2csv

all_path = '/data1/sap/frcnn_keras/data/mv_test_backup.txt'
l1_path = '/data1/sap/frcnn_keras/data/mv_val.txt'
l2_path = '/data1/sap/frcnn_keras/data/mv_test.txt'
ratio = .5

all_list = csv2list(all_path)
size = int(len(all_list) * ratio)
l1, l2 = split_list(all_list, size)

l1 = sorted(l1)
l2 = sorted(l2)

list2csv(l1_path, l1)
list2csv(l2_path, l2)
 def geocode_addresses(self, addresses):
     addresses = list(filter(None, map(str.strip, addresses)))
     geocoded = geocode_tuples(addresses,
                               region=self.cleaned_data["region"])
     return split_list(tuple.__instancecheck__, geocoded)
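In this example split_list acts as a predicate-based partition, with tuple.__instancecheck__ separating successfully geocoded tuples from failures. A minimal sketch of such a variant, assuming it returns (matching, non_matching):

def split_list(pred, items):
    # Partition items into (matching, non_matching) according to pred (assumed signature).
    matching, non_matching = [], []
    for item in items:
        (matching if pred(item) else non_matching).append(item)
    return matching, non_matching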
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. ???
        4. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    train_pos = pickle.load(open('pos/train_pos.pkl', 'rb'))
    test_pos = pickle.load(open('pos/test_pos.pkl', 'rb'))
    all_pos = train_pos + test_pos

    example_d = pickle.load(open('pos/sim_prob_dict.pkl', 'rb'))

    row_num = 0
    old_ans = []
    with open('pos/our_answers.csv', 'r') as csvfile:
         ans_reader = csv.reader(csvfile, delimiter=',')
         for row in ans_reader:
            if row_num > 0:
                old_ans.append({'id':row[0],'correctAnswer':row[1]})
            row_num += 1

    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        all_data = train + test
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))
 
        #analyzer = similarity.Analyzer()
        #feat = similarity.Featurizer(analyzer, pages_dict)
        
        #print ("Computing feature strings:")
        #fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)

#####################################
        #use_data = train
        #use_pos = train_pos
        use_data = all_data
        use_pos = all_pos

        ind = 0
        num_this = 0
        ans_types = {}
        num_q = 0
        old_relevant = []
        #for kk in trainx:
        for kk in use_data:
            for kk_pos in use_pos:
            #for kk_pos in train_pos:
                if kk_pos['id'] == kk['id']:
                    break

            #for kk_old in old_ans:
            #    if kk_old['id'] == kk['id']:
            #        break
            #ans_types.append(question_features2(kk))
            #ans_types.append(question_features2(kk_pos))
            [k,t] = question_features2(kk_pos)
            if k != 0:
                ans_types[k] = t
                num_q += 1
            #old_relevant.append(kk_old)
            ind += 1
            sys.stdout.write("Parse Progress: %f%%   \r" % (ind*100/float(len(use_data))) )
            sys.stdout.flush()

        num_empty = 0
        for ans in ans_types:
            if not(ans):
                num_empty += 1

        pred_list = {}
        ind = 0
        max_ind = len(use_data)
        for kk in range(0,len(use_data)):
            #if ind > max_ind:
                #break
            if use_data[kk]['id'] in ans_types.keys():
                ind += 1
                pred_list[use_data[kk]['id']] = answer_question(use_data[kk], \
                    ans_types[use_data[kk]['id']], pages_dict)
            else:
                ind += 1
                pred_list[use_data[kk]['id']] = []
            sys.stdout.write("Parse Progress: %f%%   \r" % (ind*100/max_ind) )
            sys.stdout.flush()        


        '''
        for kk in range(0,len(ans_types)):
            if ind > max_ind:
                break
            if (ans_types[kk]):
                ind += 1
                #pred_list.append(google_ans(trainx[kk], ans_types[kk]))
                #pred_list.append(answer_question(trainx[kk], ans_types[kk], pages_dict))
                pred_list.append(answer_question(use_data[kk], ans_types[kk], pages_dict))

            else:
                ind += 1
                pred_list.append([])
            sys.stdout.write("Parse Progress: %f%%   \r" % (ind*100/max_ind) )
            sys.stdout.flush()        '''    

        corr = 0
        total = 0
        for p in range(0,len(train)):
            q_key = train[p]['id']
            if q_key in pred_list.keys():
                if pred_list[q_key]:
                    if pred_list[q_key] == train[p]['correctAnswer']:
                    #if pred_list[p] == old_relevant[p]['correctAnswer']:
                        corr += 1
                    total +=1

        print ('Performance: ' + str(corr/float(total)))
        print ('Fraction Answered: ' + str(float(total)/float(len(use_data))))

        final_answers = pickle_ans(pred_list, use_data)

        pdb.set_trace()

        filepath = 'pos/metric_dict_10_90.pkl'
        pickle.dump(final_answers,open(filepath, 'wb'))
# optimizer = BertAdam(warmup=0.05, t_total=len(train_dataloader))
model.zero_grad()

model.to(device)
for epoch in range(epoch):
    model.train()
    train_loss = 0
    for label, query, l_query, pos, candidate_abstract, l_abstract,\
        candidate_labels, l_labels, candidate_type, candidate_abstract_numwords,\
            candidate_numattrs in tqdm(train_dataloader):
        if label.size()[0] == 1: continue

        #print(len(label))
        n_split = 100
        #if len(label > n_split):
        query_sp = split_list(query, n=n_split)
        l_query_sp = split_list(l_query, n=n_split)
        pos_sp = split_list(pos, n=n_split)
        candidate_abstract_sp = split_list(candidate_abstract, n=n_split)
        l_abstract_sp = split_list(l_abstract, n_split)
        candidate_labels_sp = split_list(candidate_labels, n_split)
        l_labels_sp = split_list(l_labels, n_split)
        candidate_type_sp = split_list(candidate_type, n_split)
        candidate_numattrs_sp = split_list(candidate_numattrs, n_split)
        candidate_abstract_numwords_sp = split_list(
            candidate_abstract_numwords, n_split)

        parts = len(query_sp)
        pred_set = []
        for i in range(parts):
            query = query_sp[i]
    def find_matching_proposals(self, region):
        proposals = self.cleaned_data["proposals"]

        return split_list(lambda p: p.region_name == region,
                          proposals)
Example #31
def test_split_list():
  x = ['hello','from','hell']
  assert u.split_list(x,'from')  == [['hello'],['hell']]
  assert u.split_list(x,'hello') == [[],['from','hell']]  
  assert u.split_list(x,'hell')  == [['hello','from'],[]]
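The asserts above pin down a separator-based variant of split_list. One implementation that satisfies them, shown only as a sketch:

def split_list(items, separator):
    # Split items around the first occurrence of separator, dropping the separator itself.
    idx = items.index(separator)
    return [items[:idx], items[idx + 1:]]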
    def library_chapter(self):
        """
        章节爬取动作
        :return:
        """
        start_url = self.get_chapter_url()
        try:
            self.driver.get(start_url)
            WebDriverWait(self.driver, 30).until(
                ec.visibility_of_element_located(
                    (By.XPATH,
                     '//div[@class="tree-head"]/span[@id="spanEdition"]')))
        except TimeoutException as e:
            self.sinOut.emit('超时!!! %s' % str(e))
            self.driver.get_screenshot_as_file('./error.png')
            return
        teaching = self.driver.find_element_by_xpath(
            '//div[@class="tree-head"]/span[@id="spanEdition"]').text
        level_name = self.driver.find_element_by_xpath(
            '//div[@class="tree-head"]/span[@id="spanGrade"]').text
        teaching = teaching.replace(':', '').replace(':', '')
        self.sinOut.emit('进行爬取章节!')
        if self.teaching_name != teaching or self.level_name != level_name:
            self.message_box.emit('警告', "没有数据!")
            return
        et = etree.HTML(self.driver.page_source)
        library_id = self.teaching
        sub_obj = et.xpath('//ul[@id="JYE_POINT_TREE_HOLDER"]/li')
        chapters_list = list()

        total = len(sub_obj)
        current_count = 0
        for item in sub_obj:
            lc_item = dict()
            lc_item['id'] = str(uuid.uuid1())
            pk = item.attrib.get('pk')
            nm = item.attrib.get('nm')
            child = utils.recursive_get_li(lc_item['id'], library_id, item)
            lc_item['pk'] = pk
            lc_item['parent_id'] = ''
            lc_item['library_id'] = library_id
            lc_item['name'] = nm
            lc_item['child'] = child
            chapters_list.append(lc_item)
            current_count += 1
            self.crawler_chapter_progress.emit(current_count, total)
        self.sinOut.emit('正在解析入库')

        if chapters_list:
            mutex.acquire()
            chapters = self.db_connect.session.query(
                LibraryChapter.name, LibraryChapter.id,
                LibraryChapter.pk).filter(
                    LibraryChapter.library_id == library_id)
            new_list = utils.split_list(chapters_list)
            if chapters.count() > 0:
                # If chapter data already exists, update it
                relational_dict = dict()
                for item in chapters:
                    # new_list = self.update_chapter_pk_id(item.id, item.pk, new_list)
                    for item2 in new_list:
                        if item2.get('pk') == item.pk:
                            relational_dict[item2['id']] = item.id
                            item2['id'] = item.id
                            break
                    for item3 in new_list:
                        if item3.get('parent_id') and relational_dict.get(
                                item3['parent_id']):
                            item3['parent_id'] = relational_dict.get(
                                item3['parent_id'])
                chapters.delete()
                self.db_connect.session.commit()
            mutex.release()

            # Insert the new values
            for item in new_list:
                mutex.acquire()
                if 'child' in item:
                    del item['child']
                self.db_connect.add(LibraryChapter(**item))
                mutex.release()
        self.sinOut.emit('章节爬取完成,重新加载查看')
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. ???
        4. Profit

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    
    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)

        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
                .format(len(trainx), len(testx)))
 
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)
        
        print ("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                open('../data/xval_feat_strings.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

    elif args.load: # load pre-computed feature strings
        print ("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
                pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))

    #XXX use this one instead   
    #feat = None
    #analyzer = similarity.Analyzer()
    #feat = similarity.Featurizer(analyzer, pages_dict)

    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr() # might already be CSR
    X.sort_indices() # needed for cosine-type measures

    # try some LDA stuff
    print ("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict) # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx+testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X) # gives probabilities for each topic

    print ("Evaluating train data:")
    acc_trainx = test_xval(trainx, X, fv,\
            #scorer=similarity.Scorer.cosine, print_info=True)
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print ("Train accuracy = {}\n".format(acc_trainx))

    print ("Evaluating test data:")
    acc_testx = test_xval(testx, X, fv,\
            #scorer=similarity.Scorer.cosine, print_info=True)
            scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    print ("Test accuracy = {}\n".format(acc_testx))
Example #34
    # Check FASTA
    with open(ofile, 'r') as of:
        count = 0
        for record in SeqIO.parse(of, 'fasta'):
            count += 1
        assert count == len(set(ids)), "FASTA ERROR: missing IDs: %s" % (cmd)
    return


if __name__ == "__main__":
    # Args
    ARGS = get_arg_parser().parse_args()

    # Tables
    ids = set()
    for tab in ARGS.tabs:
        tmp = pandas.read_csv(filepath_or_buffer=tab, sep='\t', header=0)
        assert ARGS.icol in list(
            tmp.columns), 'File %s has no column \"%s\"' % (tab, ARGS.icol)
        print('TAB %s: %d IDs' % (tab, tmp.shape[0]))
        ids.update(list(tmp[ARGS.icol]))
    print('IDS: %d' % len(ids))

    # Download
    pool = Pool(ARGS.cores)
    tmp = pool.starmap(
        download_sequences,
        enumerate([chunk for chunk in split_list(ids, ARGS.size)]))
    pool.close()
    pool.join()
Example #35
 def __init__(self, keypoint_vector):
     self.key_points = utils.split_list(keypoint_vector, 3)
            for file_name in file_names:
                scene_num = get_value_in_pattern(file_name, pattern)
                scene_num_set[cam_idx].add(scene_num)

        valid_scene_num = scene_num_set[cam_num[0]]
        for cam_idx in cam_num:
            valid_scene_num &= scene_num_set[cam_num[cam_idx]]

        all_scene_num = sorted(list(valid_scene_num))
        #print(all_scene_num)

        if (len(all_scene_num) != total):
            print('len(all_scene_num) != total', len(all_scene_num), '!=',
                  total)

        train_scene_num, test_scene_num = split_list(all_scene_num, num_train)
        val_scene_num, test_scene_num = split_list(test_scene_num, num_val)
        test_scene_num = test_scene_num[:num_test]
        for l in [train_scene_num, val_scene_num, test_scene_num]:
            l.sort()
        print(cnt,
              cls,
              len(train_scene_num),
              len(val_scene_num),
              len(test_scene_num),
              sep='\t')

        train_start_idx = num_cls[cls]
        val_start_idx = train_start_idx + len(train_scene_num)
        test_start_idx = val_start_idx + len(val_scene_num)
        num_cls[cls] = test_start_idx + len(test_scene_num)
Example #37
def cross_validate(path_to_df, text_field, label_field, n_folds=5, preprocessing_function=None,
                   additional_fields_and_preps={}, additional_data_paths=[], hyperparams={}, report_top_k=True,
                   log_dir="./", use_gpu=False, return_models=False, seed=17, verbose=False, remove_extra_labels=True):
    """

    :param path_to_df: str, path to csv or parquet file
    :param text_field: str, column of the dataframe in which is the text that should be classified
    :param label_field: str, column of the dataframe in which is the label of the corresponding text
    :param n_folds: int, number of folds
    :param preprocessing_function: function, function to apply on text_field column
    :param additional_fields_and_preps: dict. Dictionary in the following format
    {field_name1: preprocessing_function1, field_name2: preprocessing_function2} to enable custom preprocessing for
    different fields
    :param additional_data_paths: list of str, paths of fasttext format additional data to concat with train file
    :param hyperparams: dict, all hyperparams for train_supervised
    :param report_top_k: bool. If True will return top k scores, otherwise top 1 scores
    :param log_dir: str, directory to save the training files and the model
    :param use_gpu: bool, use gpu for training
    :param return_models: bool. If True will return tuple (scores, models)
    :param seed: int
    :param verbose: bool.
    :param remove_extra_labels: remove datapoints with labels which appear in additional_data_paths but not in
    train_data_path
    :return: list. The scores for each split
    """
    models, scores = [], []

    if path_to_df.endswith("parquet"):
        df = pd.read_parquet(path_to_df)
    else:
        df = pd.read_csv(path_to_df)

    for added_field, prep_f in additional_fields_and_preps.items():
        if df[added_field].dtype != "object":
            df[added_field] = df[added_field].astype(str)
        if prep_f:
            df[added_field] = df[added_field].map(prep_f)
        df[text_field] = df[text_field] + " " + df[added_field]

    for fold_number, val_mask in enumerate(split_list(len(df), n_folds, seed)):
        train_data_path, val_data_path = preprocess_and_save(df, val_mask, text_field, label_field,
                                                             preprocessing_function, additional_fields_and_preps,
                                                             "./tmp_txt/", "_split{}".format(fold_number), verbose, [])

        if verbose:
            print("train path {}".format(train_data_path))
            print("val path {}".format(val_data_path))

        hypers_new = hyperparams.copy()

        if additional_fields_and_preps:
            hypers_new["result_dir"] = os.path.join(log_dir, "{}_{}".format(hash_function(preprocessing_function),
                                                                            "_".join(
                                                                                additional_fields_and_preps.keys())))
        else:
            hypers_new["result_dir"] = os.path.join(log_dir, hash_function(preprocessing_function))
        hypers_new["use_gpu"] = int(use_gpu)
        hypers_new["split_and_train_params"] = {
            "df_path": path_to_df,
            "additional_fields_and_preps": additional_fields_and_preps, "remove_extra_labels": remove_extra_labels
        }

        model = train_supervised(train_data_path=train_data_path, val_data_path=val_data_path,
                                 additional_data_paths=additional_data_paths, hyperparams=hypers_new,
                                 preprocessing_function=preprocessing_function, remove_extra_labels=remove_extra_labels,
                                 log_dir=log_dir, use_gpu=use_gpu, verbose=verbose)

        if report_top_k:
            scores.append(model.top_k_accuracy)
        else:
            scores.append(model.top_1_accuracy)
        if return_models:
            models.append(model)
        del model
        gc.collect()
    if return_models:
        return scores, models
    return scores
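For context, a hypothetical call to cross_validate; the path, column names, and hyperparameters below are placeholders rather than values from the original project:

# Hypothetical usage sketch -- adjust the path, fields, and hyperparams to your data.
scores = cross_validate(
    path_to_df="data/comments.csv",
    text_field="text",
    label_field="label",
    n_folds=5,
    preprocessing_function=str.lower,
    hyperparams={"epoch": 10, "lr": 0.1},
    log_dir="./logs/",
    verbose=True,
)
print("mean accuracy over folds:", sum(scores) / len(scores))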