Example #1
def _save_obj_ver_edge(obj_ver_key, obj_key):
    """Save the ws_version_of edge."""
    # The _from is a version of the _to
    from_id = 'ws_object_version/' + obj_ver_key
    to_id = 'ws_object/' + obj_key
    logger.debug(f'Saving ws_version_of edge from {from_id} to {to_id}')
    save('ws_version_of', [{'_from': from_id, '_to': to_id}])
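The edge and vertex savers in these examples all delegate to a shared save() helper that is not shown in this excerpt. A minimal sketch of what such a helper could look like against ArangoDB's HTTP document API; the URL, database name, and overwrite behaviour are assumptions, not the project's actual client:

import requests

ARANGO_URL = 'http://localhost:8529'   # assumed endpoint
ARANGO_DB = '_system'                  # assumed database

def save(collection, docs):
    """Hypothetical stand-in for the save() helper used in these examples:
    bulk-insert documents into an ArangoDB collection, overwriting on key."""
    if isinstance(docs, dict):
        docs = [docs]
    resp = requests.post(
        f'{ARANGO_URL}/_db/{ARANGO_DB}/_api/document/{collection}',
        json=docs,
        params={'overwrite': 'true'},
    )
    resp.raise_for_status()
    return resp.json()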
Example #2
    def craw_stat(self):
        for c in self.sources:
            logger.debug(c["city"] + ": " + c["home"])
            html = download(c["home"], charset="utf-8")
            soup = BeautifulSoup(html, "html.parser")
            deal_div = soup.find("div", {"class": "deal-price"})
            deal_price = 0.0
            list_price = 0.0
            rate = 0.0
            vol = 0
            if deal_div and deal_div.find("label", {"class": "dataAuto"}):
                deal_price = deal_div.find("label", {"class": "dataAuto"}).text.strip().encode("utf-8")
            list_div = soup.find("div", {"class": "listing-price"})
            if list_div and list_div.find("label", {"class": "dataAuto"}):
                list_price = list_div.find("label", {"class": "dataAuto"}).text.strip().encode("utf-8")
            ul = soup.find("div", {"class": "main"}).findAll("li")
            for li in ul:
                if li.find("p").text:
                    if re.findall("客房比", li.find("p").text.encode("utf-8")):
                        rate = li.find("label").text.strip().encode("utf-8")
                    if re.findall("成交", li.find("p").text.encode("utf-8")):
                        vol = li.find("label").text.strip().encode("utf-8")
            item = {"price": deal_price, "rate": rate,
                    "city": c["city"], "vol": vol, "date": self.date}
            logger.debug(item)
            if float(deal_price) <= 0.0 or float(rate) <= 0.0:
                continue
            if not self.dao.has_stat(c["city"], self.date):
                self.dao.insert_stat(item)
Example #3
def _save_referral_edge(obj_ver_key, obj):
    """Save the ws_refers_to edge."""
    from_id = 'ws_object_version/' + obj_ver_key
    for upa in obj.get('refs', []):
        to_id = 'ws_object_version/' + upa.replace('/', ':')
        logger.debug(f'Saving ws_refers_to edge from {from_id} to {to_id}')
        save('ws_refers_to', [{'_from': from_id, '_to': to_id}])
Example #4
    def collate(batches):
        logger.debug('Batches: {}'.format(batches))
        batch_list = []

        for batch in batches:
            pair = np.array(batch[0])
            negs = np.array(batch[1])
            negs = np.vstack((pair[0].repeat(negs.shape[0]), negs)).T

            # Create arrays
            pair_arr = np.ones(
                (pair.shape[0]), dtype=int
            )  # This sets label to 1  # TODO: Leave label as continuous
            pair_arr[:-1] = pair[:-1]
            negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int)
            negs_arr[:, :-1] = negs
            all_arr = np.vstack((pair_arr, negs_arr))
            batch_list.append(all_arr)

        batch_array = np.vstack(batch_list)

        # Return item1, item2, label
        return (torch.LongTensor(batch_array[:, 0]),
                torch.LongTensor(batch_array[:, 1]),
                torch.FloatTensor(batch_array[:, 2]))
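A usage sketch (not from the original source) showing how this collate function could be plugged into a PyTorch DataLoader. The toy dataset below, where each sample is an ([item1, item2, weight], [neg1, ..., negK]) pair of integer ids, is an assumption about the expected input shape, and collate is assumed to be reachable at module scope (in the original it may be a method):

import numpy as np
from torch.utils.data import DataLoader, Dataset

class ToyPairDataset(Dataset):
    """Each item is ([item1, item2, weight], [neg1, ..., negK]) of ids."""
    def __init__(self, n=1000, n_items=500, n_negs=5):
        rng = np.random.default_rng(0)
        ids = rng.integers(0, n_items, size=(n, 2))
        weights = rng.random((n, 1))
        self.pairs = np.hstack([ids, weights])
        self.negs = rng.integers(0, n_items, size=(n, n_negs))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx], self.negs[idx]

loader = DataLoader(ToyPairDataset(), batch_size=128, collate_fn=collate)
item1, item2, label = next(iter(loader))  # LongTensors of ids, FloatTensor of 0/1 labels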
Example #5
def _save_owner_edge(obj_ver_key, info_tup):
    """Save the ws_owner_of edge."""
    username = info_tup[5]
    from_id = 'ws_user/' + sanitize_arangodb_key(username)
    to_id = 'ws_object_version/' + obj_ver_key
    logger.debug(f'Saving ws_owner_of edge from {from_id} to {to_id}')
    save('ws_owner_of', [{'_from': from_id, '_to': to_id}])
Example #6
    def get_history(self):
        today = datetime.now().strftime('%y-%m-%d')
        has = self.dao.has_item("bj", today)
        if has[0]:
            logger.debug("find history.")
            return
        html = download("http://www.fangchanzixun.com/volume",
                        charset="utf-8")
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', attrs={'class': 'table'})
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            if len(cols) != 5:
                continue
            col_date = cols[0]
            total = cols[1].encode("utf-8")
            zhuzai = cols[3].encode("utf-8")
            date = datetime.strptime(col_date, "%Y-%m-%d").date()
            info = {"city": "北京", "district": "bj", "total": total, "zhuzai": zhuzai, "date": date}
            logger.debug(info)
            has = self.dao.has_item("bj", date)
            if not has[0]:
                self.dao.insert_item(info)
Example #7
    def fit(self, features: np.array, labels: np.array) -> None:
        """Builds a random forest of decision trees.

        Args:
            features: Numpy array of features with shape (row x col)
            labels: Numpy array of labels

        Returns:
            None
        """
        n_rows, n_cols = features.shape

        for i in range(self.num_trees):
            logger.debug('{} training tree: {}'.format(self.__class__.__name__,
                                                       i + 1))
            shuffled_row_idx = np.random.permutation(n_rows)
            shuffled_col_idx = np.random.permutation(n_cols)

            row_idx = np.random.choice(shuffled_row_idx,
                                       int(self.row_subsampling * n_rows),
                                       replace=False)
            col_idx = np.random.choice(shuffled_col_idx,
                                       int(self.col_subsampling * n_cols),
                                       replace=False)
            self.col_idxs.append(col_idx)

            features_subsampled = features[np.ix_(row_idx, col_idx)]
            labels_subsampled = labels[row_idx]

            self.trees.append(
                self.__build_tree__(features_subsampled, labels_subsampled))
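A usage sketch for the fit/predict pair; the class constructor is not shown in this excerpt, so the name RandomForest and its parameters are assumptions based on the attributes fit() relies on:

import numpy as np

X = np.random.rand(200, 8)
y = (X[:, 0] > 0.5).astype(int)

# Assumed constructor; num_trees, row_subsampling and col_subsampling are the
# attributes used by the fit() method above.
forest = RandomForest(num_trees=10, row_subsampling=0.8, col_subsampling=0.6)
forest.fit(X, y)
probs = forest.predict(X)  # per-row probabilities averaged over trees (see Example #24)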
Example #8
def _save_ws_contains_edge(obj_key, info_tup):
    """Save the ws_workspace_contains_obj edge."""
    from_id = 'ws_workspace/' + str(info_tup[6])
    to_id = 'ws_object/' + obj_key
    logger.debug(
        f'Saving ws_workspace_contains_obj edge from {from_id} to {to_id}')
    save('ws_workspace_contains_obj', {'_from': from_id, '_to': to_id})
Example #9
    def clean_dfs(self) -> [pd.DataFrame]:
        training_df = self.original_training_df
        testing_df = self.original_testing_df

        # one hot encoding columns whose representation is a dict
        columns_one_hot_encoding_dict = {'belongs_to_collection': ['id', 'collection'],
                                         'genres': ['id', 'genre'],
                                         'production_countries': ['iso_3166_1', 'prod_count'],
                                         'spoken_languages': ['iso_639_1', 'spoken_lang']}
        for col, list_specific_col in columns_one_hot_encoding_dict.items():
            logger.debug(f'{col} will be one hot encoded as dict')
            training_df, testing_df = self.__one_hot_encode_representing_as_dict(training_df, testing_df, col,
                                                                                 list_specific_col)

        # special one hot encoding columns for multitude of names inside the column
        columns_one_hot_encoding_dict = {'production_companies': ['id', 'prod_comp', True, 15],
                                         'Keywords': ['id', 'k', False, 25]}
        for col, list_specific_col in columns_one_hot_encoding_dict.items():
            logger.debug(f'{col} will be one hot encoded')
            training_df, testing_df = self.__one_hot_encode_famous_names(training_df, testing_df, col, list_specific_col)

        # one hot encoding columns for information about characters of the movies
        logger.debug(f'crew will be one hot encoded as item')
        training_df, testing_df = self.__one_hot_encode_characters(training_df, testing_df, 'crew')

        # one hot encoding columns whose one row contains only one value
        logger.debug(f'original_language will be one hot encoded as item')
        training_df, testing_df = self.__one_hot_encode_representing_as_item(training_df, testing_df,
                                                                             'original_language')

        # extract date information
        logger.debug('date information will be extracted')
        training_df, testing_df = self.__extract_date_information(training_df, 'training'), self.__extract_date_information(testing_df, 'testing')

        return training_df.fillna(0), testing_df.fillna(0)
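The private one-hot-encoding helpers used above are not part of this excerpt. A minimal, generic sketch of how a column holding a list of dicts (for example genres = [{'id': 18, 'name': 'Drama'}, ...]) can be expanded into indicator columns with pandas; the column and key names are illustrative only, not the pipeline's actual helpers:

import ast
import pandas as pd

def one_hot_encode_dict_column(df, col, key='name'):
    """Expand a list-of-dicts column into 0/1 indicator columns (illustrative)."""
    parsed = df[col].fillna('[]').apply(
        lambda v: ast.literal_eval(v) if isinstance(v, str) else v)
    names = parsed.apply(lambda items: [d[key] for d in items])
    dummies = pd.get_dummies(names.explode()).groupby(level=0).max()
    return df.drop(columns=[col]).join(dummies.add_prefix(f'{col}_'))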
Example #10
def _save_inst_of_type_edge(obj_ver_key, info_tup):
    """Save the ws_obj_instance_of_type of edge."""
    from_id = 'ws_object_version/' + obj_ver_key
    obj_type = info_tup[2]
    to_id = 'ws_type_version/' + obj_type
    logger.debug(
        f'Saving ws_obj_instance_of_type edge from {from_id} to {to_id}')
    save('ws_obj_instance_of_type', [{'_from': from_id, '_to': to_id}])
Example #11
def search_workspace(params, meta):
    start = time.time()
    params = convert_params.search_workspace(params, meta)
    result = search(params, meta)
    result = convert_result.search_workspace(result, params, meta)
    logger.debug(
        f"Finished 'search_workspace' method in {time.time() - start}s")
    return result
Example #12
def search_types(params, meta):
    if isinstance(params, list) and len(params) == 1:
        params = params[0]
    start = time.time()
    query = convert_params.search_types(params)
    search_result = trap_error(lambda: search(query, meta))
    result = convert_result.search_types(search_result)
    logger.debug(f'Finished search_types in {time.time() - start}s')
    return [result]
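trap_error is not shown in this excerpt. A minimal sketch of the kind of guard it presumably provides around the search call; the real implementation may instead map failures to a JSON-RPC style error response:

import logging

logger = logging.getLogger(__name__)

def trap_error(fn):
    """Hypothetical stand-in: run a zero-argument callable and surface
    backend failures with some context before re-raising."""
    try:
        return fn()
    except Exception as err:
        logger.error(f'Search backend error: {err}')
        raise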
Example #13
def main(obj_data, ws_info, obj_data_v1, conf):
    """
    Index a narrative object on save.
    We index the latest narratives for:
        - title and author
        - cell content
        - object names and types
        - created and updated dates
        - total number of cells
    """
    # Reference for the workspace info type:
    #    https://kbase.us/services/ws/docs/Workspace.html#typedefWorkspace.workspace_info
    # Reference for the object info type:
    #    https://kbase.us/services/ws/docs/Workspace.html#typedefWorkspace.object_info
    obj_info = obj_data['info']
    obj_id = obj_info[0]
    obj_metadata = obj_info[-1]
    if not obj_metadata:
        raise RuntimeError(
            f"Cannot index narrative: no metadata for the narrative object. Obj info: {obj_info}"
        )
    [ws_id, _, owner, moddate, _, _, _, _, ws_metadata] = ws_info
    if not ws_metadata:
        raise RuntimeError(
            f"Cannot index narrative: no metadata for the workspace. WS info: {ws_info}"
        )
    if ws_metadata.get('is_temporary') == 'true':
        logger.debug("Skipping narrative indexing because it is temporary")
        return
    is_narratorial = _narrative_is_narratorial(ws_metadata)
    narrative_title = obj_metadata.get('name')
    creator = obj_data['creator']
    # Get all the types and names of objects in the narrative's workspace.
    narrative_data_objects = _fetch_objects_in_workspace(ws_id)
    # Extract all the data we want to index from the notebook cells
    raw_cells = obj_data['data'].get('cells', [])
    index_cells = _extract_cells(raw_cells, ws_id)
    result = {
        '_action': 'index',
        'doc': {
            'narrative_title': narrative_title,
            'is_narratorial': is_narratorial,
            'data_objects': narrative_data_objects,
            'owner': owner,
            'modified_at': ts_to_epoch(moddate),
            'cells': index_cells,
            'creator': creator,
            'total_cells': len(raw_cells),
            'static_narrative_saved':
            ws_metadata.get('static_narrative_saved'),
            'static_narrative_ref': ws_metadata.get('static_narrative'),
        },
        'index': conf['index_name'],
        'id': f"{conf['namespace']}::{ws_id}:{obj_id}",
    }
    yield result
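A consumption sketch (not from the original source): the indexer is a generator, so a caller might turn each yielded document into an Elasticsearch bulk action. obj_data, ws_info and obj_data_v1 are assumed to have been fetched already, and the conf values are placeholders:

conf = {'index_name': 'narrative', 'namespace': 'kbase'}  # placeholder values

bulk_actions = []
for doc in main(obj_data, ws_info, obj_data_v1, conf):
    if doc['_action'] == 'index':
        bulk_actions.append({
            '_op_type': 'index',      # elasticsearch-py helpers.bulk action format
            '_index': doc['index'],
            '_id': doc['id'],
            '_source': doc['doc'],
        })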
Example #14
def get_objects(params, meta):
    # KBase convention is to wrap params in an array
    if isinstance(params, list) and len(params) == 1:
        params = params[0]
    start = time.time()
    query = convert_params.get_objects(params)
    search_result = trap_error(lambda: search(query, meta))
    result = convert_result.get_objects(params, search_result, meta)
    logger.debug(f'Finished get_objects in {time.time() - start}s')
    return [result]
Example #15
def _save_created_with_module_edge(obj_ver_key, prov):
    """Save the ws_obj_created_with_module edge."""
    if not prov or not prov[0] or not prov[0].get('service'):
        return
    module_key = get_module_key_from_prov(prov)
    from_id = 'ws_object_version/' + obj_ver_key
    to_id = 'ws_module_version/' + module_key
    logger.debug(
        f'Saving ws_obj_created_with_module edge from {from_id} to {to_id}')
    save('ws_obj_created_with_module', [{'_from': from_id, '_to': to_id}])
Example #16
def _save_prov_desc_edge(obj_ver_key, obj):
    """Save the ws_prov_descendant_of edge."""
    prov = obj.get('provenance')
    if not prov:
        return
    input_objs = prov[0].get('input_ws_objects', [])
    from_id = 'ws_object_version/' + obj_ver_key
    for upa in input_objs:
        to_id = 'ws_object_version/' + upa.replace('/', ':')
        logger.debug(
            f'Saving ws_prov_descendant_of edge from {from_id} to {to_id}')
        save('ws_prov_descendant_of', [{'_from': from_id, '_to': to_id}])
Example #17
def _save_copy_edge(obj_ver_key, obj):
    """Save ws_copied_from document."""
    copy_ref = obj.get('copied')
    if not copy_ref:
        logger.debug('Not a copied object.')
        return
    copied_key = copy_ref.replace('/', ':')
    from_id = 'ws_object_version/' + obj_ver_key
    to_id = 'ws_object_version/' + copied_key
    logger.debug(f'Saving ws_copied_from edge from {from_id} to {to_id}')
    # "The _from object is a copy of the _to object
    save('ws_copied_from', [{'_from': from_id, '_to': to_id}])
Example #18
def _save_ws_object(obj_info, ws_info):
    """Runs at most every 300 seconds; otherwise a no-op."""
    wsid = obj_info[6]
    objid = obj_info[0]
    key = f"{wsid}:{objid}"
    logger.debug(f'Saving ws_object with key {key}')
    save('ws_object', [{
        '_key': key,
        'workspace_id': wsid,
        'object_id': objid,
        'is_public': ws_info[6] == 'r',
        'deleted': False
    }])
Example #19
def _reindex_narrative(obj, ws_info: dict) -> None:
    obj_type = obj['info'][2]
    if 'Narrative' in obj_type:
        return
    meta = ws_info[-1]
    if not isinstance(meta, dict) or meta.get('narrative') != '1':
        logger.debug("This workspace is not a narrative")
        return
    wsid = ws_info[0]
    narr_info = config()['ws_client'].find_narrative(wsid, admin=True)
    objid = narr_info[0]
    # Publish an event to reindex the narrative
    ev = {'evtype': 'REINDEX', 'wsid': wsid, 'objid': objid}
    kafka.produce(ev, callback=_delivery_report)
Example #20
def _wait_for_service(url, name, start_time, timeout, params=None):
    while True:
        try:
            logger.info(f'Waiting for {name} service...')
            requests.get(url, params=params).raise_for_status()
            logger.info(f'{name} is up!')
            break
        except Exception as err:
            logger.debug(f'Unable to connect to {name}: {err}')
            time.sleep(5)
            if (int(time.time()) - start_time) > timeout:
                raise RuntimeError(
                    f"Failed to connect to all services in {timeout}s. Timed out on {name}."
                )
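A usage sketch; the service names, URLs and timeout below are placeholders, not values from the original project:

import time

start = int(time.time())
_wait_for_service('http://localhost:8529/_api/version', 'ArangoDB', start, timeout=180)
_wait_for_service('http://localhost:9200', 'Elasticsearch', start, timeout=180)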
Example #21
    def get_embedding(self, nodes):
        embs = []
        emb_weight = self.emb_weights(nodes[:, 0])
        emb_weight_norm = self.emb_weights_softmax(emb_weight)

        for i in range(nodes.shape[1]):
            logger.debug('center i: {}'.format(i))
            embs.append(self.center_embeddings[i](nodes[:, i]))
        emb_stack = torch.stack(embs)
        embs_weighted = emb_stack * emb_weight_norm.T.unsqueeze(2).expand_as(
            emb_stack)
        emb = torch.sum(embs_weighted, axis=0)

        return emb
Example #22
def _save_created_with_method_edge(obj_ver_key, prov):
    """Save the ws_obj_created_with_method edge."""
    if not prov or not prov[0] or not prov[0].get('service'):
        return
    method_key = get_method_key_from_prov(prov)
    from_id = 'ws_object_version/' + obj_ver_key
    to_id = 'ws_method_version/' + method_key
    params = prov[0].get('method_params')
    logger.debug(
        f'Saving ws_obj_created_with_method edge from {from_id} to {to_id}')
    save('ws_obj_created_with_method', [{
        '_from': from_id,
        '_to': to_id,
        'method_params': params
    }])
Example #23
    def __build_tree__(self, features: np.array, labels: np.array, depth: int = 0) -> Node:
        """Build decision tree that learns split functions.

        Args:
            features: Numpy array of features with shape (row x col)
            labels: Numpy array of labels
            depth: Current depth of the tree (root is 0)

        Returns:
            Decision tree root node
        """
        n_percentiles = max(2, int(len(labels) / 10))

        if self.check_stopping_condition(labels, depth):
            prob = self.get_probability(labels)
            return Node(None, None, None, prob)  # type: ignore

        else:
            logger.debug('Features: {}'.format(features))
            logger.debug('Labels: {}'.format(labels))
            splits = np.percentile(features, self.get_percentile_list(n_percentiles), axis=0)

            best_split = None
            best_split_feat_idx = None
            best_split_gini_gain = float('-inf')

            for feat_idx, feat_col in enumerate(features.T):  # Transpose to loop through columns
                logger.debug('Col index: {}'.format(feat_idx))

                for split in splits[:, feat_idx]:
                    labels_left = labels[np.where(feat_col < split)]
                    labels_right = labels[np.where(feat_col >= split)]

                    gain = gini_gain(labels, [labels_left, labels_right])

                    if gain > best_split_gini_gain:
                        best_split_gini_gain, best_split, best_split_feat_idx = gain, split, feat_idx

            split_left = np.where(features[:, best_split_feat_idx] < best_split)
            split_right = np.where(features[:, best_split_feat_idx] >= best_split)
            logger.debug('Split left: {} | right: {}'.format(split_left, split_right))

            features_left, features_right = features[split_left], features[split_right]
            labels_left, labels_right = labels[split_left], labels[split_right]

            # If either node is empty after splitting
            if len(labels_left) == 0:
                node_left = Node(None, None, None, self.get_probability(labels))  # type: ignore
                node_right = self.__build_tree__(features_right, labels_right, depth + 1)
            elif len(labels_right) == 0:  # pragma: no cover
                node_left = self.__build_tree__(features_left, labels_left, depth + 1)
                node_right = Node(None, None, None, self.get_probability(labels))  # type: ignore
            else:
                node_left = self.__build_tree__(features_left, labels_left, depth + 1)
                node_right = self.__build_tree__(features_right, labels_right, depth + 1)

        return Node(node_left, node_right, lambda feature: feature[best_split_feat_idx] < best_split)
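The gini_gain helper used above is not included in this excerpt. A minimal sketch consistent with how the tree builder calls it (parent labels plus a list of child label arrays), written for illustration only:

import numpy as np

def gini_impurity(labels):
    """Gini impurity of a label array."""
    if len(labels) == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def gini_gain(parent_labels, child_label_lists):
    """Impurity reduction from splitting the parent into the given children."""
    n = len(parent_labels)
    weighted_child = sum(
        len(child) / n * gini_impurity(child) for child in child_label_lists)
    return gini_impurity(parent_labels) - weighted_child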
Example #24
    def predict(self, features: np.array) -> np.array:
        """Returns labels given a set of features.

        Args:
            features: Numpy array of features with shape (row x col)

        Returns:
            Predicted labels
        """
        labels_list = []

        for tree, col_idx in zip(self.trees, self.col_idxs):
            logger.debug('Col index: {}'.format(col_idx))
            features_subsampled = features[:, col_idx]
            labels_list.append(
                [tree.split(row) for row in features_subsampled])

        labels = np.array(labels_list).mean(axis=0)

        return labels
Example #25
    def craw_open(self):
        for c in self.sources:
            logger.debug(c["city"] + ": " + c["url"])
            arr = get_json(c["url"])
            if arr and arr["data"]:
                for item in arr["data"]:
                    if not item["avg_unit_price"]:
                        item["avg_unit_price"] = -1.0
                    if not item["name"] or not item["house_count"]:
                        logger.debug(item)
                        continue
                    row = {"city": c["city"], "district": item["name"].encode("utf-8"),
                           "total": item["house_count"],
                           "price": item["avg_unit_price"], "date": self.date}
                    old = self.dao.get_item(row["city"], row["district"], row["date"])
                    if not old:
                        self.dao.insert_item(row)
                    elif old[3] < row["total"]:
                        self.dao.update_item(row["city"], row["district"], row["total"],
                                             row["price"], row["date"])
Example #26
def _save_obj_version(key, ver, info_tup, ws_info):
    objid = info_tup[0]
    wsid = info_tup[6]
    obj_name = info_tup[1]
    hsh = info_tup[8]
    size = info_tup[9]
    epoch = ts_to_epoch(info_tup[3])
    logger.debug(f"Saving ws_object_version with key {key}")
    save('ws_object_version', [{
        '_key': key,
        'workspace_id': wsid,
        'object_id': objid,
        'version': ver,
        'name': obj_name,
        'hash': hsh,
        'size': size,
        'epoch': epoch,
        'deleted': False,
        'is_public': ws_info[6] == 'r',
    }])
Example #27
def _save_workspace(ws_info):
    """Save the ws_workspace vertex given an object info tuple."""
    wsid = ws_info[0]
    # Workspace info tuple is as follows:
    #    0  1    2     3       4        5          6          7        8
    #   [id,name,owner,moddate,maxobjid,user_perms,globalread,lockstat,metadata]
    metadata = ws_info[-1]
    logger.debug(f'Saving workspace vertex {wsid}')
    save(
        'ws_workspace', {
            '_key': str(wsid),
            'narr_name': metadata.get('narrative_nice_name', ''),
            'owner': ws_info[2],
            'max_obj_id': ws_info[4],
            'lock_status': ws_info[7],
            'name': ws_info[1],
            'mod_epoch': ts_to_epoch(ws_info[3]),
            'is_public': ws_info[6] == 'r',
            'is_deleted': False,
            'metadata': metadata
        })
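An illustrative call with a made-up workspace info tuple matching the layout documented in the comment above; all values, including the moddate format, are assumptions:

ws_info = [
    12345,                        # 0: id
    'my_workspace',               # 1: name
    'someuser',                   # 2: owner
    '2020-01-01T12:00:00+0000',   # 3: moddate (format assumed)
    42,                           # 4: maxobjid
    'a',                          # 5: user_perms
    'r',                          # 6: globalread ('r' means publicly readable)
    'unlocked',                   # 7: lockstat
    {'narrative_nice_name': 'My Narrative'},  # 8: metadata
]
_save_workspace(ws_info)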
Example #28
    def get_wangqian(self):
        html = download(self.url, charset="utf-8")
        soup = BeautifulSoup(html, "html.parser")
        total_div = soup.find("span", {"id": "ess_ctr5112_FDCJY_SignOnlineStatistics_totalCount4"})
        zhuzai_div = soup.find("span", {"id": "ess_ctr5112_FDCJY_SignOnlineStatistics_residenceCount4"})
        date_div = soup.find("span", {"id": "ess_ctr5115_FDCJY_HouseTransactionStatist_timeMark4"})
        if total_div and zhuzai_div:
            total = total_div.text.encode("utf-8")
            zhuzai = zhuzai_div.text.encode("utf-8")
            date = date_div.text.strip().encode("utf-8")
            tmp = date.split("-")
            if len(tmp) == 3:
                date = datetime.today().replace(year=int(tmp[0]), month=int(tmp[1]), day=int(tmp[2]))
            else:
                logger.error("beijing gov get wangqian.")
            row = {"city": "北京", "district": "bj", "zhuzai": zhuzai,
                   "total": total, "date": date}
            has = self.dao.has_item("bj", date)
            if not has[0]:
                logger.debug(row)
                self.dao.insert_item(row)
Example #29
def _save_type_vertices(obj_info):
    """Save associated vertices for an object type."""
    obj_type = sanitize_arangodb_key(obj_info[2])
    (type_module, type_name, type_ver) = get_type_pieces(obj_type)
    (maj_ver, min_ver) = [int(v) for v in type_ver.split('.')]
    logger.debug(
        f'Saving ws_type_version, ws_type, and ws_type_module for {obj_type}')
    save(
        'ws_type_version', {
            '_key': obj_type,
            'type_name': type_name,
            'module_name': type_module,
            'maj_ver': maj_ver,
            'min_ver': min_ver
        })
    save(
        'ws_type', {
            '_key': f'{type_module}.{type_name}',
            'type_name': type_name,
            'module_name': type_module
        })
    save('ws_type_module', {'_key': type_module})
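get_type_pieces is not shown in this excerpt. A hypothetical version, assuming KBase type strings of the form 'Module.Type-maj.min' (for example 'KBaseGenomes.Genome-8.2'):

def get_type_pieces(obj_type):
    """Split 'Module.Type-maj.min' into (module, name, version). Illustrative only."""
    full_name, _, version = obj_type.partition('-')
    module, _, name = full_name.partition('.')
    return (module, name, version)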
Example #30
    def forward(self, centers, contexts, neg_contexts):
        """Compute the skip-gram negative-sampling loss for a batch.

        Args:
            centers: List of center words
            contexts: List of context words
            neg_contexts: List of lists of negative context words

        Returns:
            Mean loss over the batch (positive plus negative-sample scores)
        """
        # Calculate positive score
        emb_centers = []
        for i in range(centers.shape[1]):
            logger.debug('center i: {}'.format(i))
            emb_centers.append(self.center_embeddings[i](centers[:, i]))
        emb_center = torch.mean(torch.stack(emb_centers), axis=0)

        emb_contexts = []
        for i in range(contexts.shape[1]):
            logger.debug('context i: {}'.format(i))
            emb_contexts.append(self.context_embeddings[i](contexts[:, i]))
        emb_context = torch.mean(torch.stack(emb_contexts), axis=0)

        emb_neg_contexts = []
        neg_contexts = neg_contexts.view(-1, len(self.context_embeddings))
        for i in range(neg_contexts.shape[1]):
            logger.debug('neg context i: {}, {}'.format(i, neg_contexts[:, i]))
            emb_neg_contexts.append(self.context_embeddings[i](
                neg_contexts[:, i]))
        emb_neg_context = torch.mean(torch.stack(emb_neg_contexts), axis=0)

        # Next two lines equivalent to torch.dot(emb_center, emb_context) but for batch
        score = torch.mul(emb_center, emb_context)  # Get dot product (part 1)
        score = torch.sum(score, dim=1)  # Get dot product (part2)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)  # Get score for the positive pairs

        # Calculate negative score (for negative samples)
        neg_score = torch.bmm(
            emb_neg_context.view(emb_center.shape[0], -1, emb_center.shape[1]),
            emb_center.unsqueeze(2)).squeeze()  # Get dot product
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        # Return combined score
        return torch.mean(score + neg_score)
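A training-step sketch; the model's constructor and the data loader are not shown in this excerpt, so both are assumptions. Since the forward pass above already returns the mean loss, a caller only needs to backpropagate it:

import torch

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # `model` is assumed

for centers, contexts, neg_contexts in loader:  # LongTensor batches, `loader` is assumed
    optimizer.zero_grad()
    loss = model(centers, contexts, neg_contexts)
    loss.backward()
    optimizer.step()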
Example #31
def index_obj(obj_data, ws_info, msg_data):
    """
    For a newly created object, generate the index document for it and push to
    the elasticsearch topic on Kafka.
    Args:
        obj_data - in-memory parsed data from the workspace object
        msg_data - json event data received from the kafka workspace events
            stream. Must have keys for `wsid` and `objid`
    """
    obj_type = obj_data['info'][2]
    (type_module, type_name, type_version) = ws_utils.get_type_pieces(obj_type)
    if (type_module + '.' +
            type_name) in config()['global']['ws_type_blacklist']:
        # Blacklisted type, so we don't index it
        return
    # check if this particular object has the tag "noindex"
    metadata = ws_info[-1]
    # If the workspace's object metadata contains a "nosearch" tag, skip it
    if metadata.get('searchtags'):
        if 'noindex' in metadata['searchtags']:
            return
    # Get the info of the first object to get the creation date of the object.
    upa = get_upa_from_msg_data(msg_data)
    try:
        obj_data_v1 = config()['ws_client'].admin_req(
            'getObjects', {
                'objects': [{
                    'ref': upa + '/1'
                }],
                'no_data': 1
            })
    except WorkspaceResponseError as err:
        ws_utils.log_error(err)
        raise err
    obj_data_v1 = obj_data_v1['data'][0]
    # Dispatch to a specific type handler to produce the search document
    (indexer, conf) = _find_indexer(type_module, type_name, type_version)
    # All indexers are generators that yield document data for ES.
    defaults = indexer_utils.default_fields(obj_data, ws_info, obj_data_v1)
    for indexer_ret in indexer(obj_data, ws_info, obj_data_v1, conf):
        if indexer_ret['_action'] == 'index':
            allow_indices = config()['allow_indices']
            skip_indices = config()['skip_indices']
            if allow_indices is not None and indexer_ret.get(
                    'index') not in allow_indices:
                # This index name is not in the indexing whitelist from the config, so we skip
                logger.debug(
                    f"Index '{indexer_ret['index']}' is not in ALLOW_INDICES, skipping"
                )
                continue
            if skip_indices is not None and indexer_ret.get(
                    'index') in skip_indices:
                # This index name is in the indexing blacklist in the config, so we skip
                logger.debug(
                    f"Index '{indexer_ret['index']}' is in SKIP_INDICES, skipping"
                )
                continue
            if '_no_defaults' not in indexer_ret:
                # Inject all default fields into the index document.
                indexer_ret = indexer_utils.merge_default_fields(
                    indexer_ret, defaults)
        yield indexer_ret
Example #32
def search_objects(params, meta):
    start = time.time()
    result = search(params, meta)
    logger.debug(f"Finished 'search_objects' method in {time.time() - start}s")
    return result
Example #33
def static_file(path):
    logger.debug(path)
    return message_app.send_static_file(path)
Example #34
if __name__ == '__main__':
    # EXTRACTING
    label_column = 'revenue'
    original_training_df = tools.get_df_from_csv(
        constants.path_training_file)[[constants.label_column] +
                                      constants.columns_to_process]
    original_testing_df = tools.get_df_from_csv(
        constants.path_testing_file)[constants.columns_to_process]

    # TRANSFORMING
    pipeline_transforming = PipelineTransforming(original_training_df,
                                                 original_testing_df)
    training_df, testing_df = pipeline_transforming.clean_dfs()

    logger.debug(f'Training shape: {training_df.shape}')
    logger.debug(f'Testing shape: {testing_df.shape}')
    logger.debug(f'Training columns: {training_df.columns}')

    # TRANSFORMING AND LOADING
    if mode == 'cross_validate_model':
        rf = RandomForestRegressor(**parameters_rf)  # model
        pipeline_loading.cross_validate_model(training_df, rf)
    elif mode == 'tune_hyperparameters_grid_search_cv':
        pipeline_loading.tune_hyperparameters_by_grid_search_cv(training_df)
    elif mode == 'tune_hyperparameters_randomized_search_cv':
        pipeline_loading.tune_hyperparameters_by_randomized_search_cv(
            training_df)
    elif mode == 'produce_submission_result':
        rf = RandomForestRegressor(**parameters_rf)  # model
        pipeline_loading.produce_submission_result(training_df, testing_df, rf)