Code example #1
File: worker.py Project: jpotterm/spc
def execute():
    app = request.forms['app']
    user = request.forms['user']
    cid = request.forms['cid']
    desc = request.forms['desc']
    np = request.forms['np']
    appmod = pickle.loads(request.forms['appmod'])
    # remove the appmod key
    del request.forms['appmod']
    appmod.write_params(request.forms, user)

    # if preprocess is set run the preprocessor
    try:
        if appmod.preprocess:
            run_params, _, _ = appmod.read_params(user, cid)
            base_dir = os.path.join(user_dir, user, app)
            process.preprocess(run_params, appmod.preprocess, base_dir)
        if appmod.preprocess == "terra.in":
            appmod.outfn = "out" + run_params['casenum'] + ".00"
    except:
        return template('error',
                        err="There was an error with the preprocessor")

    # submit job to queue
    try:
        priority = db(users.user == user).select(
            users.priority).first().priority
        uid = users(user=user).id
        jid = sched.qsub(app, cid, uid, np, priority, desc)
        return str(jid)
        #redirect("http://localhost:"+str(config.port)+"/case?app="+str(app)+"&cid="+str(cid)+"&jid="+str(jid))
    except OSError:
        return "ERROR: a problem occurred"
Code example #2
File: data.py Project: rowancheung/fetal
    def __init__(self,
                 input_files,
                 label_files=None,
                 concat_files=None,
                 batch_size=1,
                 label_types=None,
                 tile_inputs=False):
        self.input_files = input_files
        self.label_files = label_files
        self.inputs = np.array([preprocess(file, resize=True, tile=tile_inputs) for file in input_files])
        self.inputs = np.reshape(self.inputs, (-1,) + self.inputs.shape[2:])

        if concat_files is not None:
            concats = [[preprocess(file, resize=True, tile=tile_inputs) for file in channel] for channel in concat_files]
            concats = np.reshape(concats, (-1,) + self.inputs.shape[2:-1] + (len(concats),))
            self.inputs = np.concatenate((self.inputs, concats), axis=-1)  # append the extra channels along the last axis

        if label_files is not None:
            self.labels = np.array([preprocess(file, resize=True, tile=tile_inputs) for file in label_files])
            self.labels = np.reshape(self.labels, (-1,) + self.labels.shape[2:])
        else:
            self.labels = None

        self.batch_size = batch_size
        self.label_types = label_types
        self.tile_inputs = tile_inputs
        self.n = len(self.inputs)
        self.idx = 0
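
When concat_files is given, the extra channels are reshaped to match the input volumes and appended along the last axis. A minimal NumPy sketch of the assumed shapes (dummy arrays, hypothetical sizes):

import numpy as np

# Hypothetical sizes: 4 volumes of 32x32x32 voxels with 1 channel each,
# plus 2 extra channels preprocessed to the same spatial shape.
inputs = np.zeros((4, 32, 32, 32, 1))
concats = np.zeros((4, 32, 32, 32, 2))

merged = np.concatenate((inputs, concats), axis=-1)
print(merged.shape)  # (4, 32, 32, 32, 3): extra channels appended last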
Code example #3
def light_gbm_predict(do_not_pre_classes):
    from process import preprocess
    preprocess()

    x_test = pd.read_csv(r"../user_data/x_test.csv").iloc[:, 1:]
    x_test['age'].fillna(42.627019408001736, inplace=True)
    print(x_test)
    rows_number = x_test.iloc[:, 0].size
    clf_list = load_model("model.ml")

    pred_list = []

    name2idx, idx2name = read_class_name(config.arrythmia)

    for i in tqdm(range(34)):
        if i in do_not_pre_classes:
            pred_list.append(np.zeros(rows_number))
            print("skip .........")
            continue

        clf = clf_list[i]
        p_test = clf.predict(x_test)
        pred_list.append(p_test)

    return np.array(pred_list).T
Code example #4
File: data.py Project: dhinkris/placenta
    def __getitem__(self, idx):
        batch = []
        for file in self.inputs[self.batch_size * idx:self.batch_size * (idx + 1)]:
            if self.load_files:
                volume = file
            elif self.concat is None:
                volume = preprocess(file, self.funcs)
            else:
                volume = np.concatenate((preprocess(file, self.funcs), self.concat), axis=-1)
            batch.append(volume)
        batch = np.array(batch)

        if self.seeds is not None:
            seeds = []
            for file in self.seeds[self.batch_size * idx:self.batch_size * (idx + 1)]:
                seed = file if self.load_files else preprocess(file, ['resize'])
                seeds.append(seed)
            batch = np.concatenate((batch, np.array(seeds)), axis=-1)

        if self.seed_type is not None:
            if self.labels is None:
                raise ValueError('No labels to generate slices.')
            if self.seeds is not None:
                raise ValueError('Seeds already exist.')

            new_batch = np.zeros(tuple(list(batch.shape[:-1]) + [batch.shape[-1] + 1]))
            for i, file in enumerate(self.labels[self.batch_size * idx:self.batch_size * (idx + 1)]):
                label = file if self.load_files else preprocess(file, ['resize'])
                if self.seed_type == 'slice':
                    seed = np.zeros(batch[i].shape)
                    r = np.random.choice(label.shape[0])
                    while not np.any(label[r]):
                        r = np.random.choice(label.shape[0])
                    seed[r] = label[r]
                elif self.seed_type == 'volume':
                    seed = label.copy()
                new_batch[i] = np.concatenate((batch[i], seed), axis=-1)
            batch = new_batch

        if self.include_labels:
            if self.labels is None:
                raise ValueError('No labels provided.')

            labels = []
            for file in self.labels[self.batch_size * idx:self.batch_size * (idx + 1)]:
                label = file if self.load_files else preprocess(file, ['resize'])
                labels.append(label)
            labels = np.array(labels)
            batch = (batch, labels)
        
        return batch
Code example #5
    def _get_batch(self, index_array):

        batch = []
        if self.label_types is None:
            for _, i in enumerate(index_array):
                if self.load_files:
                    x = self.inputs[i]
                elif self.tile_inputs:
                    x = preprocess(self.inputs[i], tile=self.tile_inputs)[i%8]
                elif self.random_gen:
                    s = self.samples[i]
                    n = np.random.choice(self.frames[s])
                    x = preprocess(_format(self.input_file_format, s, n), resize=self.resize, tile=self.tile_inputs)
                else:
                    x = preprocess(self.inputs[i], resize=self.resize)
                if self.augment:
                    x = self.image_transformer.random_transform(x, seed=self.seed)
                batch.append(x)
            return np.asarray(batch)

        labels = []
        for _, i in enumerate(index_array):
            if self.load_files:
                x = self.inputs[i]
                y = self.labels[i]
            elif self.tile_inputs:
                x = preprocess(self.inputs[i], tile=self.tile_inputs)[i%8]
                y = preprocess(self.labels[i], tile=self.tile_inputs)[i%8]
            elif self.random_gen:
                s = self.samples[i]
                n = np.random.choice(self.frames[s])
                x = preprocess(_format(self.input_file_format, s, n), resize=self.resize, tile=self.tile_inputs)
                y = preprocess(_format(self.label_file_format, s, n), resize=self.resize, tile=self.tile_inputs)
            else:
                x = preprocess(self.inputs[i], resize=self.resize)
                y = preprocess(self.labels[i], resize=self.resize)
            if self.augment:
                x, y = self.image_transformer.random_transform(x, y, seed=self.seed)
            batch.append(x)
            labels.append(y)

        all_labels = []
        for label_type in self.label_types:
            if label_type == 'label':
                if self.labels is None:
                    raise ValueError('Labels not provided.')
                all_labels.append(labels)
            elif label_type == 'input':
                all_labels.append(batch)
            else:
                raise ValueError(f'Label type {label_type} is not supported.')
        if len(all_labels) == 1:
            all_labels = all_labels[0]
        return (np.asarray(batch), np.asarray(all_labels))
Code example #6
    def __init__(self,
                 input_files,
                 seed_files=None,
                 label_files=None,
                 batch_size=1,
                 seed_type=None,
                 crop_size=constants.SHAPE,
                 concat_files=None,
                 load_files=False,
                 include_labels=False,
                 rescale=True):
        self.inputs = input_files
        self.seeds = seed_files
        self.labels = label_files
        self.batch_size = batch_size
        self.seed_type = seed_type
        self.crop_size = crop_size
        self.concat = None
        self.load_files = load_files
        self.include_labels = include_labels
        self.rescale = rescale
        self.shape = shape(input_files[0])
        self.n = len(input_files)
        self.idx = 0

        if concat_files is not None:
            self.concat = np.concatenate(
                (preprocess(concat_files[0], resize=True, rescale=True),
                 preprocess(concat_files[1], resize=True)),
                axis=-1)

        if load_files:
            self.inputs = np.array([
                preprocess(file, resize=True, rescale=self.rescale)
                for file in input_files
            ])
            if self.concat is not None:
                new_inputs = []
                for vol in self.inputs:
                    new_inputs.append(
                        np.concatenate((vol, self.concat), axis=-1))
                self.inputs = np.array(new_inputs)
            if seed_files is not None:
                self.seeds = np.array(
                    [preprocess(file, resize=True) for file in seed_files])
            if label_files is not None:
                self.labels = np.array(
                    [preprocess(file, resize=True) for file in label_files])
Code example #7
def test_factorized(input_file, model, ckpt_dir, scale, cube_size, min_num, postfix=''):
    # Pre-process
    cubes, cube_positions, points_numbers = preprocess(input_file, scale, cube_size, min_num)
    ### Encoding
    strings, min_v, max_v, shape = compress_factorized(cubes, model, ckpt_dir)
    # Write files
    filename = os.path.split(input_file)[-1][:-4]
    print(filename)
    rootdir = './compressed'+ postfix +'/'
    bytes_strings, bytes_pointnums, bytes_cubepos = write_binary_files_factorized(
        filename, strings.numpy(), points_numbers, cube_positions,
        min_v.numpy(), max_v.numpy(), shape.numpy(), rootdir)
    # Read files
    strings_d, points_numbers_d, cube_positions_d, min_v_d, max_v_d, shape_d = \
        read_binary_files_factorized(filename, rootdir)
    # Decoding
    cubes_d = decompress_factorized(strings_d, min_v_d, max_v_d, shape_d, model, ckpt_dir)

    # bpp
    N = get_points_number(input_file)
    bpp = round(8*(bytes_strings + bytes_pointnums + bytes_cubepos)/float(N), 4)
    bpp_strings = round(8*bytes_strings/float(N), 4)
    bpp_pointsnums = round(8*bytes_pointnums/float(N) ,4)
    bpp_cubepos = round(8*bytes_cubepos/float(N), 4)
    bpp_strings_hyper = 0
    bpp_strings_head = 0
    bpps = [bpp, bpp_strings, bpp_strings_hyper, bpp_strings_head, bpp_pointsnums, bpp_cubepos]

    return cubes_d, cube_positions_d, points_numbers_d, N, bpps
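
The bits-per-point (bpp) values here are simply the compressed byte counts times 8 divided by the number of input points. A quick sanity check with made-up byte counts (hypothetical numbers, not from the project):

# Hypothetical byte counts for a point cloud with 100,000 points.
N = 100_000
bytes_strings, bytes_pointnums, bytes_cubepos = 120_000, 2_000, 1_500
bpp = round(8 * (bytes_strings + bytes_pointnums + bytes_cubepos) / float(N), 4)
print(bpp)  # 9.88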
Code example #8
def e2e(s):
    # make the input space-delimited in prefix notation
    ir = postprocess(infix_to_prefix(preprocess(s)))
    # split on space and turn into nested tuples
    tup = tuple_for_polish_expression(ir.split(' '))
    # convert to MRS and return
    return prettyUMRSForTuple(tup)
Code example #9
def execute():
    global user
    check_user_var()
    app = request.forms.app
    cid = request.forms.cid
    np = request.forms.np
    desc = request.forms.desc
    #priority = request.forms.priority
    params = {}
    base_dir = os.path.join(myapps[app].user_dir,user,app,cid)

    # if preprocess is set run the preprocessor
    try:
        if myapps[app].preprocess:
            run_params,_,_ = myapps[app].read_params(user,cid)
            processed_inputs = process.preprocess(run_params,
                                       myapps[app].preprocess,base_dir)
        if myapps[app].preprocess == "terra.in":
            myapps[app].outfn = "out"+run_params['casenum']+".00"
    except:
        return template('error',err="There was an error with the preprocessor")

    # submit job to queue
    try:
        params['cid'] = cid
        params['app'] = app
        params['user'] = user
        priority = db(users.user==user).select(users.priority).first().priority
        jid = sched.qsub(app,cid,user,np,priority,desc)
        redirect("/case?app="+app+"&cid="+cid+"&jid="+jid)
    except OSError, e:
        print >>sys.stderr, "Execution failed:", e
        params = { 'cid': cid, 'output': pbuffer, 'app': app, 'user': user,
                   'err': e, 'apps': myapps.keys() }
        return template('error',params)
Code example #10
File: demo.py Project: rohitkuk/EmotionDetection
    def inference(self, img):
        img_info = {"id": 0}
        if isinstance(img, str):
            img_info["file_name"] = os.path.basename(img)
            img = cv2.imread(img)
            if img is None:
                raise ValueError("test image path is invalid!")
        else:
            img_info["file_name"] = None

        height, width = img.shape[:2]
        img_info["height"] = height
        img_info["width"] = width
        img_info["raw_img"] = img

        img, ratio = preprocess(img, self.test_size, self.rgb_means, self.std)
        img_info["ratio"] = ratio
        img = F.expand_dims(mge.tensor(img), 0)

        t0 = time.time()
        outputs = self.model(img)
        outputs = postprocess(outputs, self.num_classes, self.confthre,
                              self.nmsthre)
        logger.info("Infer time: {:.4f}s".format(time.time() - t0))
        return outputs, img_info
Code example #11
    def __init__(self,
                 input_files,
                 label_files=None,
                 batch_size=1,
                 seed_type=None,
                 concat_files=None,
                 rotation_range=90.,
                 shift_range=0.1,
                 shear_range=0.1,
                 zoom_range=0.1,
                 crop_size=constants.SHAPE,
                 fill_mode='nearest',
                 cval=0.,
                 flip=True):
        self.inputs = np.array(
            [preprocess(file, rescale=True) for file in input_files])

        if label_files is not None:
            self.labels = np.array([preprocess(file) for file in label_files])
        else:
            self.labels = None

        self.seed_type = seed_type

        if concat_files is not None:
            concat = np.concatenate(
                (preprocess(concat_files[0]), preprocess(concat_files[1])),
                axis=-1)
            new_inputs = []
            for vol in self.inputs:
                new_inputs.append(np.concatenate((vol, concat), axis=-1))
            self.inputs = np.array(new_inputs)

        image_transformer = ImageTransformer(rotation_range=rotation_range,
                                             shift_range=shift_range,
                                             shear_range=shear_range,
                                             zoom_range=zoom_range,
                                             crop_size=crop_size,
                                             fill_mode=fill_mode,
                                             cval=cval,
                                             flip=flip)

        super().__init__(self.inputs,
                         self.labels,
                         image_transformer,
                         batch_size=batch_size)
Code example #12
    def process_data(self, filename):
        """
        Load the file and preprocess the data
        """
        self.data = preprocess(filename)

        self.tablecmd, self.tablespeed, self.tableacc, self.speedsection, self.accsection, self.timesection = process(
            self.data)
Code example #13
File: plot_data.py Project: GeoffGao/apollo
    def process_data(self, filename):
        """
        load the file and preprocess the data
        """
        self.data = preprocess(filename)

        self.tablecmd, self.tablespeed, self.tableacc, self.speedsection, self.accsection, self.timesection = process(
            self.data)
Code example #14
File: dataset.py Project: MaybeS/AI-Hackathon
    def __init__(self, dataset_path: str, max_length: int):
        queries_path = os.path.join(dataset_path, 'train', 'train_data')
        labels_path = os.path.join(dataset_path, 'train', 'train_label')

        with open(queries_path, 'rt', encoding='utf8') as f:
            self.queries = preprocess(f.readlines(), max_length)
        with open(labels_path) as f:
            self.labels = np.array([[np.float32(x)] for x in f.readlines()])
Code example #15
File: dataset.py Project: MaybeS/AI-Hackathon
    def __init__(self, dataset_path: str, vocasize: int, minlen: int,
                 maxlen: int):
        data_review = path.join(dataset_path, 'train', 'train_data')
        data_label = path.join(dataset_path, 'train', 'train_label')

        with open(data_review, 'rt', encoding='utf-8') as f:
            self.reviews = preprocess(f.readlines(), vocasize, minlen, maxlen)

        with open(data_label) as f:
            self.labels = [np.float32(x) for x in f.readlines()]
Code example #16
    def infer(raw_data, **kwargs):
        data = preprocess(raw_data, config.vocasize, config.minlen,
                          config.maxlen)
        model.eval()

        prediction = model(data)
        point = prediction.data.squeeze(dim=1).tolist()

        # DONOTCHANGE: They are reserved for nsml
        # The return value must be a list of (confidence interval, point) tuples to be submitted to the leaderboard; the confidence interval value does not affect the leaderboard result.
        return list(zip(np.zeros(len(point)), point))
Code example #17
File: eval.py Project: ywu40/PCGCv1
def test_hyper(input_file,
               model,
               ckpt_dir,
               scale,
               cube_size,
               min_num,
               postfix=''):
    # Pre-process
    cubes, cube_positions, points_numbers = preprocess(input_file, scale,
                                                       cube_size, min_num)
    ### Encoding
    y_strings, y_min_vs, y_max_vs, y_shape, z_strings, z_min_v, z_max_v, z_shape, x_ds = compress_hyper(
        cubes, model, ckpt_dir, True)
    # Write files
    filename = os.path.split(input_file)[-1][:-4]
    print(filename)
    rootdir = './compressed' + postfix + '/'
    bytes_strings, bytes_strings_head, bytes_strings_hyper, bytes_pointnums, bytes_cubepos = write_binary_files_hyper(
        filename, y_strings.numpy(), z_strings.numpy(), points_numbers,
        cube_positions, y_min_vs.numpy(), y_max_vs.numpy(), y_shape.numpy(),
        z_min_v.numpy(), z_max_v.numpy(), z_shape.numpy(), rootdir)
    # Read files
    y_strings_d, z_strings_d, points_numbers_d, cube_positions_d,  y_min_vs_d, y_max_vs_d, y_shape_d, z_min_v_d, z_max_v_d, z_shape_d =  \
        read_binary_files_hyper(filename, rootdir)
    # Decoding
    cubes_d = decompress_hyper(y_strings_d, y_min_vs_d.astype('int32'),
                               y_max_vs_d.astype('int32'), y_shape_d,
                               z_strings_d, z_min_v_d, z_max_v_d, z_shape_d,
                               model, ckpt_dir)
    # cheat!!!
    ##############
    print("decoding error on gpu", "!" * 20,
          np.max(tf.abs(cubes_d - x_ds).numpy()), "!" * 20)
    cubes_d = x_ds
    ##############
    # bpp
    N = get_points_number(input_file)
    bpp = round(
        8 * (bytes_strings + bytes_strings_head + bytes_strings_hyper +
             bytes_pointnums + bytes_cubepos) / float(N), 4)

    bpp_strings = round(8 * bytes_strings / float(N), 4)
    bpp_strings_hyper = round(8 * bytes_strings_hyper / float(N), 4)
    bpp_strings_head = round(8 * bytes_strings_head / float(N), 4)
    bpp_pointsnums = round(8 * bytes_pointnums / float(N), 4)
    bpp_cubepos = round(8 * bytes_cubepos / float(N), 4)
    bpps = [
        bpp, bpp_strings, bpp_strings_hyper, bpp_strings_head, bpp_pointsnums,
        bpp_cubepos
    ]

    return cubes_d, cube_positions_d, points_numbers_d, N, bpps
Code example #18
File: data.py Project: rowancheung/fetal
    def __init__(self,
                 input_files,
                 label_files=None,
                 concat_files=None,
                 batch_size=1,
                 rotation_range=90.,
                 shift_range=0.1,
                 shear_range=0.1,
                 zoom_range=0.1,
                 crop_size=constants.SHAPE,
                 fill_mode='nearest',
                 cval=0.,
                 flip=True,
                 label_types=None):
        self.input_files = input_files
        self.label_files = label_files
        self.inputs = [preprocess(file) for file in input_files]

        if concat_files is not None:
            concats = [[preprocess(file) for file in channel] for channel in concat_files]
            self.inputs = np.concatenate((self.inputs, *concats), axis=-1)  # append each extra channel along the last axis

        if label_files is not None:
            self.labels = [preprocess(file) for file in label_files]
        else:
            self.labels = None

        self.label_types = label_types

        image_transformer = ImageTransformer(rotation_range=rotation_range,
                                             shift_range=shift_range,
                                             shear_range=shear_range,
                                             zoom_range=zoom_range,
                                             crop_size=crop_size,
                                             fill_mode=fill_mode,
                                             cval=cval,
                                             flip=flip)

        super().__init__(self.inputs, self.labels, image_transformer, batch_size=batch_size)
Code example #19
File: api.py Project: neuromarket/optimizer
def json():
    """
    Process ads JSON with daily breakdown of channel (optional), ad_id,
    impressions, engagements, clicks and conversions;
    return options with suggested status or share for next period
    """

    # Check if JSON contains required data
    if not 'optimize' in request.json:  # pragma: no cover
        if not 'stats' in request.json:
            return '"optimize" and "stats" keys missing in posted JSON object'
        return '"optimize" key missing in posted JSON object'
    if not 'stats' in request.json:  # pragma: no cover
        return '"stats" key missing in posted JSON object'

    if not request.json['optimize']:  # pragma: no cover
        if not request.json['stats']:
            return '"optimize" and "stats" keys are empty'
        return '"optimize" key is empty'
    if not request.json['stats']:  # pragma: no cover
        return '"stats" key is empty'

    weights = {
        'impression_weight': 0,
        'engagement_weight': 0,
        'click_weight': 0,
        'conversion_weight': 0
    }
    for metric in request.json['optimize']:
        weights[metric[:-1] + '_weight'] = None

    data = pd.DataFrame(request.json['stats'])
    data = pro.preprocess(data, **weights)
    data = pro.filter_dates(data, cutoff=CUTOFF)
    [options, data] = pro.reindex_options(data)
    bandit = add_daily_results(data,
                               num_options=len(options),
                               memory=True,
                               shape=SHAPE,
                               cutoff=CUTOFF,
                               cut_level=CUT_LEVEL)
    shares = choose(bandit=bandit, accelerate=True)
    options = format_results(options, shares)
    return options.to_json(orient='records')
Code example #20
    def __init__(self,
                 frames,
                 input_file_format=None,
                 label_file_format=None,
                 label_types=None,
                 load_files=True,
                 random_gen=False,
                 augment=False,
                 resize=False,
                 tile_inputs=False,
                 batch_size=1,
                 seed=None,
                 ):

        self.frames = frames
        self.samples = list(self.frames.keys())
        self.load_files = load_files
        self.random_gen = random_gen
        self.augment = augment
        self.resize = resize
        self.tile_inputs = tile_inputs

        self.input_file_format = input_file_format
        self.label_file_format = label_file_format
        self.input_files = []
        self.label_files = None if self.label_file_format is None else []
        self.label_types = label_types

        if not self.random_gen and self.input_file_format is not None:
            for s in self.frames:
                for n in self.frames[s]:
                    self.input_files.append(_format(self.input_file_format, s, n))
                    if self.label_file_format:
                        self.label_files.append(_format(self.label_file_format, s, n))
        else:
            print("[data_utils] either {input,label}_file_format or {input,label}_file_list must be present.")
            exit(1)

        self.inputs = self.input_files
        self.labels = self.label_files

        if self.load_files:
            #this loads everything into memory
            if self.random_gen:
                raise ValueError(\
                    'Input sampling is only supported ' + \
                    'if files are not preloaded.')
            self.inputs = [preprocess(file, resize=self.resize, tile=self.tile_inputs) for file in self.input_files]
            if self.label_files is not None:
                self.labels = [preprocess(file, resize=self.resize, tile=self.tile_inputs) for file in self.label_files]
            if self.tile_inputs:
                self.inputs = np.reshape(self.inputs, (-1,) + np.asarray(self.inputs).shape[-4:])
                if self.label_files is not None:
                    self.labels = np.reshape(self.labels, (-1,) + np.asarray(self.labels).shape[-4:])
        elif self.tile_inputs:
            self.inputs = np.repeat(self.inputs, 8, axis=0)
            if self.label_files is not None:
                self.labels = np.repeat(self.labels, 8, axis=0)

        if self.augment:
            if self.tile_inputs:
                raise ValueError('Augmentation not supported if inputs are tiled.')
            self.image_transformer = ImageTransformer(rotation_range=90.,
                                                      shift_range=0.1,
                                                      shear_range=0.1,
                                                      zoom_range=0.1,
                                                      crop_size=constants.SHAPE,
                                                      fill_mode='nearest',
                                                      cval=0,
                                                      flip=True)

        super().__init__(max(len(self.samples), len(self.inputs)), batch_size, self.augment, seed)
Code example #21
def predict():
    input_data = request.get_json(force=True)
    transformed_input_data = preprocess(input_data)
    prediction = model.predict(transformed_input_data)
    transformed_prediction = postprocess(prediction)
    return jsonify({"prediction": transformed_prediction})
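
This reads like a Flask-style endpoint (request.get_json, jsonify); the app object, the loaded model, and the preprocess/postprocess helpers are defined elsewhere in the project. A minimal self-contained sketch of that wiring, with hypothetical stand-ins for the model and helpers:

from flask import Flask, request, jsonify
import numpy as np

app = Flask(__name__)

# Hypothetical stand-ins for the project's model and process-module helpers.
class DummyModel:
    def predict(self, x):
        return np.asarray(x).sum(axis=-1, keepdims=True)

model = DummyModel()

def preprocess(payload):
    # pull an assumed "features" list out of the JSON body
    return np.asarray(payload["features"], dtype=float).reshape(1, -1)

def postprocess(pred):
    # make the prediction JSON-serialisable
    return pred.ravel().tolist()

@app.route("/predict", methods=["POST"])
def predict():
    input_data = request.get_json(force=True)
    prediction = model.predict(preprocess(input_data))
    return jsonify({"prediction": postprocess(prediction)})

# Example call once the app is running:
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"features": [1, 2, 3]}' http://localhost:5000/predict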
Code example #22
File: Sample.py Project: isakrs/v2019-hackathon
# In[ ]:

from process import preprocess

# Get dataset with features

# In[ ]:

config = {
    'pred_var':
    'Torvet PM10',  # Must include station and pollutants name (column name)
    'stations': ['Torvet'],  # Stations to use in feature extraction
    'window': 6,
}

data = preprocess(**config)

# In[ ]:

print('X train', data['X_train'].shape)
print('y train', data['y_train'].shape)
print('X validation', data['X_val'].shape)
print('X test', data['X_test'].shape)

#print(data['X_train'].columns)

# **Train Multi Output RF | GBM | MLP**
#
# _Params are hidden inside each file_

# In[ ]:
Code example #23
File: api.py Project: neuromarket/optimizer
def csv():
    """
    Provide form to paste ads CSV with daily breakdown of channel (optional),
    ad_id, impressions, engagements, clicks and conversions;
    return options with suggested budget share or status for next period and
    provide direct upload to Facebook via API
    """

    if request.method == 'POST':
        if request.form['update'] == 'true':  # pragma: no cover
            app_id = request.form['app_id']
            app_secret = request.form['app_secret']
            access_token = request.form['access_token']
            channels = ast.literal_eval(request.form['channels'])
            records = ast.literal_eval(request.form['records'])
            updatable = ['facebook', 'instagram']
            indices = []
            for channel in updatable:
                if channel in channels:
                    indices.append(channels.index(channel))
            results = pd.DataFrame(columns=['ad_id', 'ad_status'])
            for index in indices:
                for record in records[index]:
                    results.loc[len(results)] = \
                        [record['ad_id'], record['ad_status']]
            updated = update_facebook(app_id, app_secret, access_token,
                                      results)
            records = updated.to_dict('records')
            columns = updated.columns.values
            return render_template('update_result.html',
                                   records=records,
                                   columns=columns)

        weights = {}
        for weight in [
                'impression_weight', 'engagement_weight', 'click_weight',
                'conversion_weight'
        ]:
            if request.form[weight] == '':
                weights[weight] = None
            else:
                weights[weight] = int(request.form[weight])

        data = pd.read_csv(StringIO(request.form['ads']),
                           sep=None,
                           engine='python')

        try:
            data = pro.preprocess(data, weights['impression_weight'],
                                  weights['engagement_weight'],
                                  weights['click_weight'],
                                  weights['conversion_weight'])
        except Exception as error:  # pragma: no cover
            print(error)
            message = 'Cannot pre-process your data. \
                     Please check the CSV input format and try again.'

            return render_template(
                'csv.html',
                error=message,
                output=request.form['output'],
                impression_weight=request.form['impression_weight'],
                engagement_weight=request.form['engagement_weight'],
                click_weight=request.form['click_weight'],
                conversion_weight=request.form['conversion_weight'],
                ads=request.form['ads'])

        try:
            data = pro.filter_dates(data, cutoff=CUTOFF)
        except Exception as error:  # pragma: no cover
            print(error)
            message = 'Please check your dates (format should be YYYY-MM-DD).'
            return render_template(
                'csv.html',
                error=message,
                output=request.form['output'],
                impression_weight=request.form['impression_weight'],
                engagement_weight=request.form['engagement_weight'],
                click_weight=request.form['click_weight'],
                conversion_weight=request.form['conversion_weight'],
                ads=request.form['ads'])
        if data.empty:  # pragma: no cover
            error = 'Please include results with data from the past ' + str(
                CUTOFF) + ' days.'
            return render_template(
                'csv.html',
                error=error,
                output=request.form['output'],
                impression_weight=request.form['impression_weight'],
                engagement_weight=request.form['engagement_weight'],
                click_weight=request.form['click_weight'],
                conversion_weight=request.form['conversion_weight'],
                ads=request.form['ads'])

        [options, data] = pro.reindex_options(data)

        bandit = add_daily_results(data,
                                   num_options=len(options),
                                   memory=True,
                                   shape=SHAPE,
                                   cutoff=CUTOFF,
                                   cut_level=CUT_LEVEL)

        shares = choose(bandit=bandit, accelerate=True)

        output = request.form['output']
        if output == 'status':
            results = format_results(options, shares, status=True)
        elif output == 'share':
            results = format_results(options, shares, status=False).round(2)

        if 'channel' in options.columns:
            channel_shares = format_results(options, shares, status=False). \
                groupby('channel')['ad_share'].sum().round(2)
            channels = []
            records = []
            for name, group in results.groupby('channel'):
                channels.append(name)
                group = group.drop(['channel'], axis=1)
                columns = group.columns.values
                records.append(group.to_dict('records'))
            return render_template('csv_result_channels.html',
                                   channels=channels,
                                   channel_shares=channel_shares,
                                   records=records,
                                   columns=columns)

        records = results.to_dict('records')
        columns = results.columns.values
        return render_template('csv_result.html',
                               records=records,
                               columns=columns)

    return render_template('csv.html')
Code example #24
response = {
    'SmsSid': 'SM994801c6ee52cb08db6affa285661e12',
    'FromState': 'AZ',
    'SmsStatus': 'received',
    'FromCity': 'PHOENIX',
    'Body': 'go from Delhi to Gurgaon',
    'FromCountry': 'US',
    'To': '%2B14804284194',
    'ToZip': '85034',
    'NumSegments': '1',
    'MessageSid': 'SM994801c6ee52cb08db6affa285661e12',
    'AccountSid': 'ACffa2ba37390d2cc87d8b52bf6d869c2a',
    'From': '%2B16022886791',
    'ApiVersion': '2010-04-01'
}
test_string = response['Body']
message_body = preprocess(test_string)

query_type = find_query_type(message_body)
# print(query_type)
ans = ""
if query_type == 1:
    source, destination = find_src_dest(message_body)
    print(source, destination)
    ans = route1(source, destination)
else:
    # source, destination = find_src_dest(message_body)
    message_body = message_body.lower()
    near_me, query = near_locs(message_body)
    source = find_src(query)
    print('near me: ', near_me)
    print('source:', source)
Code example #25
from time import time
from process import preprocess
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif


features_train_vect,features_train, features_test, labels_train, labels_test = preprocess()

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"
import collections

t0 = time()
pred = clf.predict(features_test)
print "prediction time:", round(time()-t0, 3), "s"
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(pred , labels_test)
print accuracy


from bs4 import BeautifulSoup
from newspaper import Article

urls = ['http://www.newsmax.com/Politics/putin-tv-trump-dangerous/2017/04/17/id/784706/',
        'http://www.hollywoodreporter.com/heat-vision/star-wars-rare-archival-footage-shown-at-celebration-had-funny-new-hope-f-bomb-994552?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thr%2Ffilm+%28The+Hollywood+Reporter+-+Movies%29&utm_content=FeedBurner',
        'http://www.espn.com/sports/endurance/story/_/id/19177433/boston-marathon-2017-devin-wang-another-year-brings-closure-tragedy']
Code example #26
def lambda_handler(event, context):
    to_number = "+919079945319"
    from_number = "+13345131650"
    test_string = event['Body']
    message_body = preprocess(test_string)

    query_type = find_query_type(message_body)
    # print(query_type)
    ans = ""
    if query_type == 1:
        source, destination = find_src_dest(message_body)
        ans += "\nDirections \n"
        ans += "From :- " + source + "\n"
        ans += "To :- " + destination + "\n"
        ans += route1(source, destination)
    else:
        # source, destination = find_src_dest(message_body)
        message_body = message_body.lower()
        near_me, query = near_locs(message_body)
        source = find_src(query)
        ans += "\nFollowing results were fetched near your current location :- \n"
        ans += route2(source, near_me)
    # location_from, location_to = find_src_dest(message_body)

    # response_from = requests.get(
    #     "https://nominatim.openstreetmap.org/?addressdetails=1&q=" + location_from + "&format=json&limit=1")
    # response_to = requests.get(
    #     "https://nominatim.openstreetmap.org/?addressdetails=1&q=" + location_to + "&format=json&limit=1")
    # lat_from = response_from.json()[0]['lat']
    # lon_from = response_from.json()[0]['lon']

    # query_type = 0
    # ans = ""
    # near_me = "hospital"

    # if query_type == 0:

    #     lat_to = response_to.json()[0]['lat']
    #     lon_to = response_to.json()[0]['lon']

    #     response_route = requests.get(
    #         'http://www.mapquestapi.com/directions/v2/route?key=j1IVnoFZUzzkteLml8NKw1wjF5x5mGK3&from=' + lat_from + ',' + lon_from + '&to=' + lat_to + ',' + lon_to)
    #     print("Start Point:", location_from, lat_from, lon_from)
    #     print("End Point:", location_to, lat_to, lon_to)
    #     direction = ["none", "north", "northwest", "northeast", "south", "southeast", "southwest", "west", "east"]
    #     turnType = ["straight", "slight right", "right", "sharp right", "reverse", "sharp left", "left", "slight left",
    #                 "right u-turn", "left u-turn", "right merge", "left merge", "right on ramp", "left on ramp",
    #                 "right off ramp", "left off ramp", "right fork", "left fork", "straight fork"]

    #     for obb in response_route.json()["route"]["legs"][0]["maneuvers"]:
    #         s1 = "Take " + turnType[obb["turnType"]] + " and go " + str(int(obb["distance"] * 1609)) + " meters, in " + \
    #              direction[obb["direction"]] + " direction"
    #         s2 = obb["narrative"]
    #         ans += s1 + '\n' + s2 + '\n\n'
    #     # print(s1)
    #     # print(s2, end="\n\n")

    # elif query_type == 1:
    #     r = requests.get(
    #         'http://open.mapquestapi.com/nominatim/v1/search.php?key=j1IVnoFZUzzkteLml8NKw1wjF5x5mGK3&format=json&q=' + lat_from + ',' + lon_from + '+[' + near_me + ']&addressdetails=1&limit=20')
    #     print(r.json())
    #     for obb in r.json():
    #         ans += obb["display_name"] + '\n'
    #     # print(i["display_name"])

    # ans = ans.strip()
    # print(ans)

    body = ans
    print(body)
    print(event)

    if not TWILIO_ACCOUNT_SID:
        return "Unable to access Twilio Account SID."
    elif not TWILIO_AUTH_TOKEN:
        return "Unable to access Twilio Auth Token."
    elif not to_number:
        return "The function needs a 'To' number in the format +12023351493"
    elif not from_number:
        return "The function needs a 'From' number in the format +19732644156"
    elif not body:
        return "The function needs a 'Body' message to send."

    # insert Twilio Account SID into the REST API URL
    populated_url = TWILIO_SMS_URL.format(TWILIO_ACCOUNT_SID)
    post_params = {"To": to_number, "From": from_number, "Body": body}

    # encode the parameters for Python's urllib
    data = parse.urlencode(post_params).encode()
    req = request.Request(populated_url)

    # add authentication header to request based on Account SID + Auth Token
    authentication = "{}:{}".format(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
    base64string = base64.b64encode(authentication.encode('ascii'))
    req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))

    try:
        # perform HTTP POST request
        with request.urlopen(req, data) as f:
            print("Twilio returned {}".format(str(f.read().decode('utf-8'))))
    except Exception as e:
        print("something went wrong!")
        return e

    return ''
Code example #27
import json
import string
import collections
from collections import Counter
from nltk.corpus import stopwords
import vincent

import process  # provides preprocess()

#Stop Words
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['RT', 'via','de','o']

with open('data/stream_twitterarthursun.json', 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list with all the terms
        terms_all = [term for term in process.preprocess(tweet['text']) if term not in stop]
        #terms_hash = [term for term in process.preprocess(tweet['text'])
         #     if term.startswith('#')]
        #terms_single = set(terms_all)
        #terms_only = [term for term in process.preprocess(tweet['text'])
        #      if term not in stop and
        #      not term.startswith(('#', '@'))]
        #Update the counter
        count_all.update(terms_all)
        #count_all.update(terms_hash)
        #count_all.update(terms_only)
        #count_all.update(terms_single)



Code example #28

def read_class_name(path):
    df_thmia = pd.read_csv(path, header=None, sep="\t")
    name2idx = {
        name: i
        for i, name in enumerate(list(df_thmia.values.reshape(-1)))
    }
    idx2name = {idx: name for name, idx in name2idx.items()}
    return name2idx, idx2name


if __name__ == '__main__':
    from process import preprocess

    preprocess()
    name2idx, idx2name = read_class_name(config.arrythmia)
    x_test = pd.read_csv(r"../user_data/x_test.csv").iloc[:, 1:]
    x_test['age'].fillna(42.627019408001736, inplace=True)
    print(x_test)
    clf_list = load_model("model.ml")
    test = pd.read_csv(config.test_label, sep='\t', header=None, dtype=str)
    p = [[] for i in range(test.iloc[:, 0].size)]

    for i in tqdm(range(34)):
        print(str(i) + ":" + idx2name[i])
        clf = clf_list[i]
        p_test = clf.predict(x_test)

        for j in range(len(p_test)):
            if p_test[j] == 1:
Code example #29
def main():
    LOGGER_LEVEL = 10
    RAW_DATA_PATH = './data/raw/'
    RAW_CSV_NAME = 'raw_data.csv'

    t0 = time.time()
    logger = config.config_logger(__name__, LOGGER_LEVEL)
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))
    logger.info('Beginning execution: zika dataset')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))

    logger.info('Opening CSV: {0}{1}'.format(RAW_DATA_PATH, RAW_CSV_NAME))
    raw_data = pd.read_csv(RAW_DATA_PATH + RAW_CSV_NAME)
   
    logger.info('Raw dataset description:') 
    process.basic_descriptives(raw_data)
    raw_data = process.preprocess(raw_data) 
    #print(raw_data.describe().transpose().to_string())
    #print(raw_data.head().to_string())
    #print(raw_data.info().to_string())

    y_dengue = raw_data['dengue_pcr']
    y_zika = raw_data['zika_pcr']
    y_chik = raw_data['chik_pcr']
    diseases = [y_dengue, y_zika, y_chik]
    # Check process code for further explanation of select_disease function.
    # code: 1. Dengue, 2. Zika, 3. Chik, 4. Any
    # only_one: if True, input np.nan to patients with another disease.
    y = process.select_disease(diseases, code=1, only_one=False)
    logger.info('Target var frequency: \n{0}'.format(y.value_counts()))
    logger.info('Total obs: {0}'.format(y.value_counts().sum()))

    remove_list = ['id', 'centro_pob', 'name', 'dep', 'prov', 'dist',
                   'serotipo1', 'serotipo2', 'serotipo3', 'serotipo4',
                   'dengue_pcr', 'zika_pcr', 'chik_pcr']

    X = process.remove_vars(raw_data, remove_list)
    X = process.keep_non_nan(X, y)
    y = y.dropna()

    logger.info('Features dataset')
    process.basic_descriptives(X)

    logger.info('Split train test')
    X_train, X_test, y_train, y_test = models.split_data(X, y, proportion=0.4)

    logger.info('Estimating models')
    logger.info('GBM')
    grid_gbm = models.gbm_grid(X_train, y_train, n_cv=5)
    logger.info(grid_gbm.best_params_)
    logger.info('Train score: {0}'.format(grid_gbm.best_score_))
    logger.info('Test score: {0}'.format(grid_gbm.score(X_test, y_test)))

    logger.info('Logit')
    grid_logit = models.logit_grid(X_train, y_train, n_cv=5)
    logger.info(grid_logit.best_params_)
    logger.info('Train score: {0}'.format(grid_logit.best_score_))
    logger.info('Test score: {0}'.format(grid_logit.score(X_test, y_test)))

    logger.info('AdaBoost')
    grid_adaboost = models.adaboost_grid(X_train, y_train, n_cv=5)
    logger.info(grid_adaboost.best_params_)
    logger.info('Train score: {0}'.format(grid_adaboost.best_score_))
    logger.info('Test score: {0}'.format(grid_adaboost.score(X_test, y_test)))

    logger.info('Soft Voting')
    eclf = VotingClassifier(estimators=[('gbm', grid_gbm), ('logit', grid_logit),
                                        ('ada', grid_adaboost)], voting='soft')
    eclf.fit(X_train, y_train)
    y_pred = eclf.predict_proba(X_test)
    print(y_pred[:5,:])
    logger.info('Train score: {0}'.format(eclf.score(X_train, y_train)))
    logger.info('Test score: {0}'.format(eclf.score(X_test, y_test)))

    config.time_taken_display(t0)
Code example #30
File: test.py Project: xtorker/PCGCv1
    python test.py decompress "compressed/longdress_vox10_1300" \
        --ckpt_dir="checkpoints/hyper/a6b3/"
    """

    args = parse_args()
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1.0
    config.gpu_options.allow_growth = True
    config.log_device_placement=True
    sess = tf.Session(config=config)

    model = importlib.import_module(args.modelname)

    if args.mode == "factorized":
        if args.command == "compress":
            cubes, cube_positions, points_numbers = preprocess(args.input, args.scale, args.cube_size, args.min_num)
            strings, min_v, max_v, shape = compress_factorized(cubes, model, args.ckpt_dir)
            if not args.output:
                args.output = os.path.split(args.input)[-1][:-4]
                rootdir = './compressed'
            else:
                rootdir, args.output = os.path.split(args.output)
            bytes_strings, bytes_pointnums, bytes_cubepos = write_binary_files_factorized(
                args.output, strings.numpy(), points_numbers, cube_positions, min_v.numpy(), max_v.numpy(), shape.numpy(), rootdir=rootdir)

        elif args.command == "decompress":
            rootdir, filename = os.path.split(args.input)
            if not args.output:
                args.output = filename + "_rec.ply"
            strings_d, points_numbers_d, cube_positions_d, min_v_d, max_v_d, shape_d = read_binary_files_factorized(filename, rootdir)
            cubes_d = decompress_factorized(strings_d, min_v_d, max_v_d, shape_d, model, args.ckpt_dir)
Code example #31
import nltk

# nltk downloads
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# loading the dataset
print("Loading the dataset .......")
reviews_df = pd.read_csv("../dataset/AllProductReviews.csv")

# removing neutral reviews
reviews_df = reviews_df[reviews_df["ReviewStar"] != 3]

# preprocessing the reviews
reviews_df = process.preprocess(reviews_df)

# word stemming with Part of speech tagging
reviews_df = process.word_stemming(reviews_df)

# splitting the dataset into training and testing sets
train_df, test_df = train_test_split(reviews_df, test_size=0.4, shuffle=True)

# extracting the features from the text data
aspects, values = feature_processing.feature_extraction(train_df)

# creating the feature vectors
feature_vectors, y = feature_processing.create_feature_vector(
    train_df, aspects, values)
feature_vectors_test, y_test = feature_processing.create_feature_vector(
    test_df, aspects, values)
Code example #32
File: metoncofit.py Project: kirksmi/MetOncoFit
"""

import sys
import process
import random_forest
import validator
import visualizations
import save

import pandas as pd
from openpyxl import load_workbook

# Create data structures that will be used in the analysis
df, df1, header, canc, targ, data, classes, orig_data, orig_classes, excl_targ, freq = process.preprocess(
    datapath='./../data/median/',
    fil=sys.argv[1],
    targ=sys.argv[2],
    exclude=sys.argv[3])

# Random Forest Classifier, prediction, and hold out accuracy
rfc, rfc_pred, mean_acc = random_forest.random_forest(canc, targ, data,
                                                      classes, orig_data,
                                                      orig_classes)

# Model performance and statistical measures. THIS FUNCTION IS ALSO NECESSARY TO GENERATE THE FIGURES.
cm, pvalue, zscore, cv_score, summary = validator.summary_statistics(
    rfc, rfc_pred, data, classes, orig_classes, orig_data, targ, excl_targ,
    mean_acc, canc)

# Model comparison with Auslander et al., 2016. Use only gene expression in these predictions for a true comparison.
#df2 = df1.copy(deep=True)
Code example #33
    def pushSummarys(self, tweet):
        if ('delete' not in tweet) and (tweet['lang'] == 'en'):
            if 'retweeted_status' in tweet:
                tem = tweet['retweeted_status']
                tem['timestamp_ms'] = tweet['timestamp_ms']
                tem['created_at'] = tweet['created_at']
                tweet = tem
            # t1 = T.time()
            delta = self.time.calculatetime(tweet['created_at'])
            if delta >= 1:
                for x in range(self.L):
                    self.numofdayA[x] = 0
                    self.numofdayB[x] = 0
                    stemwords_interest_profile = self.interest_files[x]
                    listofsummaryA = [summary[0] for summary in self.summaryA[x] if summary[1] == self.day]
                    if len(listofsummaryA) > 0:
                        self.tfidfthresholdA[x] = min(summaryA[2] for summaryA in listofsummaryA)
                    #     # self.jsdthresholdA[x] = min(summaryA[3] for summaryA in listofsummaryA)
                    del listofsummaryA
                    listofsummaryB = [summary[0] for summary in self.summaryB[x] if summary[1] == self.day]
                    if len(listofsummaryB) > 0:
                        self.tfidfthresholdB[x] = min(summaryB[2] for summaryB in listofsummaryB)
                        # self.jsdthresholdB[x] = min(summaryB[3] for summaryB in listofsummaryB)
                        sumoflen = sum(summaryBBBB[5] for summaryBBBB in listofsummaryB)
                        ADL = sumoflen / len(listofsummaryB)
                        lenofq = len(stemwords_interest_profile)
                        result = []
                        for summaryBBB in listofsummaryB:
                            score = 0
                            TF = summaryBBB[4]
                            for q in stemwords_interest_profile:
                                tf = TF[q]
                                avgtf = sum(TF[qq] for qq in stemwords_interest_profile) / len(TF)
                                RITF = math.log2(1 + tf) / math.log2(1 + avgtf)
                                LRTF = tf * math.log2(1 + ADL / summaryBBB[5]) + 0.0001
                                w = 2 / (1 + math.log2(1 + lenofq))
                                TFF = w * RITF / (1 + RITF) + (1 - w) * LRTF / (1 + LRTF)
                                IDF = math.log((len(listofsummaryB) + 1) / (self.qoccur[x][q] + 1)) + 0.0001
                                AEF = self.numofq[x][q] / (self.qoccur[x][q] + 1)
                                TDF = IDF * AEF / (1 + AEF)
                                sim = TFF * TDF
                                score += sim
                                del tf, avgtf, RITF, LRTF, w, TFF, IDF, AEF, TDF, sim
                            # score += summaryBBB[3]

                            result.append([score, summaryBBB[1]])
                        del listofsummaryB
                        result.sort(key=operator.itemgetter(0), reverse=True)
                        j = 1
                        day = str(self.day + 1)
                        # d = '201507' + day
                        for i in result:
                            if (self.day) >= 9:
                                d = '201608' + day
                            else:
                                d = '2016080' + day
                            with open('B.txt', 'a') as ff:
                                ff.write(
                                    '%s %s Q0 %s %s %s CCNUNLPrun1\n' % (d, self.topicid[x], i[1], str(j), i[0]))
                            j = j + 1
                self.time.settime()
                self.day = self.day + 1
            content = tweet['text']
            stemwords_tweet = preprocess(content)
            del content
            wordInTweet = {}
            if stemwords_tweet == False:
                pass
            else:
                numOfWordAtweet = len(stemwords_tweet)
                self.SumOfLenthOfStream = numOfWordAtweet + self.SumOfLenthOfStream
                id_str = tweet['id_str']
                for word in stemwords_tweet:
                    if word in self.wordInStream:
                        self.wordInStream[word] += 1
                    else:
                        self.wordInStream[word] = 1
                    if word in wordInTweet:
                        wordInTweet[word] += 1
                    else:
                        wordInTweet[word] = 1
                for x in range(self.L):
                    stemwords_interest_profile = self.interest_files[x]
                    count = sum(stemwords_tweet.count(wordsss) for wordsss in stemwords_interest_profile)
                    # print(count)
                    if count >= 2:
                        sumoftfidf = 0.0
                        del count
                        for word in stemwords_tweet:
                            if word in self.queries_word[x]:
                                self.queries_word[x][word] += 1
                            else:
                                self.queries_word[x][word] = 1
                        for word in set(stemwords_tweet):
                            if word not in self.queries_occur[x]:
                                self.queries_occur[x][word] = 1
                            else:
                                self.queries_occur[x][word] += 1
                        self.queries_numOfWord[x] += numOfWordAtweet
                        self.queries_numOfTweet[x] += 1

                        for word in stemwords_tweet:
                            tf = self.queries_word[x][word] / self.queries_numOfWord[x]
                            idf = math.log2((self.queries_numOfTweet[x] + 1) / self.queries_occur[x][word])
                            sumoftfidf = sumoftfidf + tf * idf
                        if sumoftfidf >= self.tfidfthresholdA[x] and self.numofdayA[x] < 10:
                            listofsummaryA = [summary[0] for summary in self.summaryA[x]]
                            if len(listofsummaryA) > 0:
                                jsd = []
                                for summary in listofsummaryA:
                                    sumofjsd = 0
                                    tf = {}
                                    for wordss in summary[0]:
                                        if wordss in tf:
                                            tf[wordss] += 1
                                        else:
                                            tf[wordss] = 1
                                    sameword = [word for word in stemwords_tweet if
                                                word in summary[0]]
                                    if len(sameword) > 0:
                                        for word in sameword:
                                            Pti = float(wordInTweet[word]) / float(numOfWordAtweet)
                                            Psi = float(self.wordInStream[word]) / float(self.SumOfLenthOfStream)
                                            thetaTi = self.lemda * Pti + (1 - self.lemda) * Psi
                                            Ptj = float(tf[word]) / float(len(summary[0]))
                                            Psj = float(self.wordInStream[word]) / float(self.SumOfLenthOfStream)
                                            thetaTj = self.lemda * Ptj + (1 - self.lemda) * Psj
                                            # sumofjsd += thetaTi * math.log(thetaTj/thetaTj)
                                            M = (thetaTi + thetaTj) / 2
                                            sumofjsd += (0.5 * (thetaTi * math.log(thetaTi / M)) + 0.5 * (
                                                thetaTj * math.log(thetaTj / M)))
                                        jsd.append(sumofjsd)
                                    else:
                                        jsd.append(0.07)
                                JSD = min(jsd)
                            else:
                                JSD = 0.05
                            # print(JSD)
                            if JSD >= self.jsdthresholdA[x]:
                                # print(self.topicid[x]+str(type(self.topicid[x])))
                                # print(id_str+str(type(id_str)))
                                #self.rest.Post(self.topicid[x], id_str)
                                self.jsdthresholdA[x]=JSD
                                self.numofdayA[x] += 1
                                a = [stemwords_tweet, id_str, sumoftfidf, JSD]
                                self.summaryA[x].append([a, self.day])
                                self.fa.write('%s %s tfidf:%s jsd:%s\n' % (self.day, self.topicid[x], sumoftfidf, JSD))
                        if sumoftfidf >= self.tfidfthresholdB[x] and self.numofdayB[x] < 100:
                            listofsummaryB = [summary[0] for summary in self.summaryB[x]]
                            if len(listofsummaryB) > 0:
                                jsd = []
                                for summary in listofsummaryB:
                                    sumofjsd = 0
                                    sameword = [word for word in stemwords_tweet if word in summary[0]]
                                    tf = {}
                                    for wordss in summary[0]:
                                        if wordss in tf:
                                            tf[wordss] += 1
                                        else:
                                            tf[wordss] = 1
                                    if len(sameword) > 0:
                                        for word in sameword:
                                            Pti = float(wordInTweet[word]) / float(numOfWordAtweet)
                                            Psi = float(self.wordInStream[word]) / float(self.SumOfLenthOfStream)
                                            thetaTi = self.lemda * Pti + (1 - self.lemda) * Psi
                                            Ptj = float(tf[word]) / float(len(summary[0]))
                                            Psj = float(self.wordInStream[word]) / float(self.SumOfLenthOfStream)
                                            thetaTj = self.lemda * Ptj + (1 - self.lemda) * Psj
                                            # sumofjsd += thetaTi * math.log(thetaTi/thetaTj)
                                            M = float(thetaTi + thetaTj) / 2
                                            sumofjsd += 0.5 * (thetaTi * math.log(thetaTi / M)) + 0.5 * (
                                                thetaTj * math.log(thetaTj / M))
                                        jsd.append(sumofjsd)
                                    else:
                                        jsd.append(0.07)
                                JSD = min(jsd)
                            else:
                                JSD = 0.05
                            # print(JSD)
                            if JSD >= self.jsdthresholdB[x]:
                                self.numofdayB[x] += 1
                                lenoflistB = len(listofsummaryB)
                                self.jsdthresholdB[x] = (lenoflistB * self.jsdthresholdB[x] + JSD) / (lenoflistB + 1)
                                TF = {}
                                for q in stemwords_interest_profile:
                                    TF[q] = stemwords_tweet.count(q)
                                    if q in stemwords_tweet:
                                        if q in self.qoccur[x]:
                                            self.qoccur[x][q] += 1
                                        else:
                                            self.qoccur[x][q] = 1
                                    else:
                                        self.qoccur[x][q] = 0
                                    if q in self.numofq[x]:
                                        self.numofq[x][q] += stemwords_tweet.count(q)
                                    else:
                                        self.numofq[x][q] = stemwords_tweet.count(q)
                                b = [stemwords_tweet, id_str, sumoftfidf, JSD, TF, numOfWordAtweet]
                                self.summaryB[x].append([b, self.day])
        pass