Example 1
def load_broadinstitute_exac(data_folder):
    t0 = time.time()
    exacs = load_broadinstitute_exac_all(data_folder)
    for k,v in load_broadinstitute_exac_nontcga(data_folder).items():
        try:
            exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
        except KeyError:
            exacs[k] = v
    for k,v in load_broadinstitute_exac_nonpsych(data_folder).items():
        try:
            exacs[k]["exac"]["nonpsych"] = v["exac"]["nonpsych"]
        except KeyError:
            exacs[k] = v

    logging.info("Convert transcript ID to EntrezID")
    from ..ensembl.parser import EnsemblParser
    from biothings.utils.hub_db import get_src_dump
    ensembl_doc = get_src_dump().find_one({"_id":"ensembl"}) or {}
    ensembl_dir = ensembl_doc.get("data_folder")
    assert ensembl_dir, "Can't find Ensembl data directory (used for id conversion)"
    ensembl_parser = EnsemblParser(ensembl_dir)
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    for line in tabfile_feeder(os.path.join(ensembl_dir,"gene_ensembl__translation__main.txt")):
        _,ensid,transid,_ = line
        if transid in exacs:
            data = exacs.pop(transid) # pop so no-match means no data in the end
            for entrezid in ensembl2entrez.get(ensid,[ensid]):
                exacs[entrezid] = data

    return exacs
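
The two merge loops above graft the "nontcga" and "nonpsych" sub-documents onto the records already loaded by load_broadinstitute_exac_all(), falling back to storing the whole record when the transcript is new. A minimal, self-contained sketch of that pattern with made-up transcript IDs and values (not real ExAC data):

exacs = {"ENST0001": {"exac": {"af": 0.01}}}
nontcga = {
    "ENST0001": {"exac": {"nontcga": {"af": 0.02}}},  # transcript already known
    "ENST0002": {"exac": {"nontcga": {"af": 0.03}}},  # new transcript
}
for k, v in nontcga.items():
    try:
        exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
    except KeyError:
        exacs[k] = v
assert exacs["ENST0001"]["exac"] == {"af": 0.01, "nontcga": {"af": 0.02}}
assert exacs["ENST0002"] == {"exac": {"nontcga": {"af": 0.03}}}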
Example 2
 def source_info(self, source=None):
     src_dump = get_src_dump()
     src_ids = list(self.register.keys())
     if source:
         if source in src_ids:
             src_ids = [source]
         else:
             return None
     res = []
     for _id in src_ids:
         src = src_dump.find_one({"_id": _id}) or {}
         assert len(
             self.register[_id]
         ) == 1, "Found more than one dumper for source '%s': %s" % (
             _id, self.register[_id])
         dumper = self.register[_id][0]
         src.setdefault("download", {})
         src["download"]["dumper"] = {
             "name":
             "%s.%s" %
             (inspect.getmodule(dumper).__name__, dumper.__name__),
             "bases": [
                 "%s.%s" % (inspect.getmodule(k).__name__, k.__name__)
                 for k in dumper.__bases__
             ],
             "manual":
             issubclass(dumper, ManualDumper),
         }
         src["name"] = _id
         src["_id"] = _id
         res.append(src)
     if source:
         return res.pop()
     else:
         return res
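
source_info() above records, for each dumper class, its fully qualified name and the names of its base classes using the inspect module. A standalone sketch of that metadata construction with a made-up dumper class (not one registered in a hub):

import inspect

class BaseDumper:            # hypothetical base class
    pass

class MyDumper(BaseDumper):  # hypothetical dumper class
    pass

info = {
    "name": "%s.%s" % (inspect.getmodule(MyDumper).__name__, MyDumper.__name__),
    "bases": ["%s.%s" % (inspect.getmodule(k).__name__, k.__name__)
              for k in MyDumper.__bases__],
}
# When run as a script this prints:
# {'name': '__main__.MyDumper', 'bases': ['__main__.BaseDumper']}
print(info)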
Example 3
def main(confirm=True):
    src_dump = get_src_dump()
    ensembl_doc = src_dump.find_one({"_id": "ensembl"}) or {}
    ENSEMBL_DATA_FOLDER = ensembl_doc.get("data_folder")
    assert ENSEMBL_DATA_FOLDER, "Can't find Ensembl data folder"
    entrez_doc = src_dump.find_one({"_id": "entrez"}) or {}
    ENTREZ_DATA_FOLDER = entrez_doc.get("data_folder")
    assert ENTREZ_DATA_FOLDER, "Can't find Entrez data folder"

    gene_ensembl_1_xref_dm_file = os.path.join(
        ENSEMBL_DATA_FOLDER, "gene_ensembl__xref_entrezgene__dm.txt")
    gene_ensembl_2_main_file = os.path.join(ENSEMBL_DATA_FOLDER,
                                            "gene_ensembl__gene__main.txt")
    gene2ensembl_file = os.path.join(ENTREZ_DATA_FOLDER, "gene2ensembl.gz")
    gene_main_file = os.path.join(ENTREZ_DATA_FOLDER, "gene_info.gz")

    outfile = os.path.join(ENSEMBL_DATA_FOLDER,
                           "gene_ensembl__gene__extra.txt")

    multi_mapping_dict, total_ensembl_IDs = find_multiple_mappings_from_entrezgene_file(
        gene_ensembl_1_xref_dm_file)
    ensembl_dict = create_ensembl_gene_id_dict(gene_ensembl_2_main_file,
                                               multi_mapping_dict)
    ensembl_dict, ensembl_match_count = find_ncbi_ids_from_gene2ensembl(
        ensembl_dict, gene2ensembl_file)
    ncbi_id_symbols = find_ncbi_symbols(gene_main_file, ensembl_dict)
    mapping_generator = merge_mapping(ensembl_dict,
                                      ncbi_id_symbols,
                                      add_source=False)
    total_mapped = write_mapping_file(mapping_generator,
                                      outfile,
                                      confirm=confirm)
    run_stats(total_ensembl_IDs, ensembl_dict, ensembl_match_count,
              total_mapped)
Example 4
 def prepare_src_dump(self):
     src_dump = get_src_dump()
     # just populate/initiate an src_dump record if needed (b/c no dump happened before)
     self.src_doc = src_dump.find_one({'_id': self.main_source})
     if not self.src_doc:
         src_dump.save({"_id": self.main_source})
         self.src_doc = src_dump.find_one({'_id': self.main_source})
     return src_dump
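
prepare_src_dump() above is idempotent: the src_dump record is created only when it does not exist yet. A minimal sketch of that find-or-create behavior, using an in-memory stand-in for the hub_db collection (the real one is returned by get_src_dump()):

class FakeSrcDump:
    """In-memory stand-in for the src_dump collection (illustration only)."""
    def __init__(self):
        self.docs = {}
    def find_one(self, query):
        return self.docs.get(query["_id"])
    def save(self, doc):
        self.docs[doc["_id"]] = doc

src_dump = FakeSrcDump()
main_source = "my_source"          # hypothetical source name
src_doc = src_dump.find_one({"_id": main_source})
if not src_doc:                    # first call: record gets created
    src_dump.save({"_id": main_source})
    src_doc = src_dump.find_one({"_id": main_source})
assert src_doc == {"_id": main_source}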
Example 5
    def export_mapping(self, plugin_name, folder):
        res = {
            "mapping": {
                "status": None,
                "file": None,
                "message": None,
                "origin": None
            }
        }
        # first check if the plugin defines a custom mapping in its manifest;
        # if that's the case, we don't need to export the mapping here
        # as it'll be exported along with the "uploader" code
        plugindoc = get_data_plugin().find_one({"_id": plugin_name})
        assert plugindoc, "Can't find plugin named '%s'" % plugin_name
        plugin_folder = plugindoc.get("download", {}).get("data_folder")
        assert plugin_folder, "Can't find plugin folder for '%s'" % plugin_name
        try:
            manifest = json.load(
                open(os.path.join(plugin_folder, "manifest.json")))
            if "mapping" in manifest.get("uploader", {}):
                res["mapping"][
                    "message"] = "Custom mapping included in uploader export"
                res["mapping"]["status"] = "warning"
                res["mapping"]["origin"] = "custom"
                return res
        except Exception as e:
            self.logger.error("Can't read manifest while exporting code: %s" %
                              e)
        # try to export mapping from src_master (official)
        doc = get_src_master().find_one({"_id": plugin_name})
        if doc:
            mapping = doc.get("mapping")
            res["mapping"]["origin"] = "registered"
        else:
            doc = get_src_dump().find_one({"_id": plugin_name})
            mapping = doc and doc.get("inspect", {}).get("jobs", {}).get(plugin_name, {}).get("inspect", {}).\
                get("results", {}).get("mapping")
            res["mapping"]["origin"] = "inspection"
        if not mapping:
            res["mapping"]["origin"] = None
            res["mapping"]["status"] = "warning"
            res["mapping"][
                "message"] = "Can't find registered or generated (inspection) mapping"
            return res
        else:
            ufile = os.path.join(folder, "upload.py")
            strmap, _ = yapf_api.FormatCode(pprint.pformat(mapping))
            with open(ufile, "a") as fout:
                fout.write("""
    @classmethod
    def get_mapping(klass):
        return %s\n""" % textwrap.indent((strmap), prefix="    " * 2))

        res["mapping"]["file"] = ufile
        res["mapping"]["status"] = "ok"

        return res
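
The final branch above appends a generated get_mapping() classmethod to the plugin's upload.py. The snippet below reproduces that code-generation step in isolation, with a made-up mapping and without the yapf reformatting pass, to show the shape of the generated code:

import pprint
import textwrap

mapping = {"gene": {"type": "keyword"}}   # hypothetical Elasticsearch mapping
strmap = pprint.pformat(mapping)
snippet = """
    @classmethod
    def get_mapping(klass):
        return %s\n""" % textwrap.indent(strmap, prefix="    " * 2)
# In export_mapping() this string is appended to upload.py; here we just print it.
print(snippet)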
Example 6
 def clean_stale_status(self):
     src_dump = get_src_dump()
     srcs = src_dump.find()
     for src in srcs:
         jobs = src.get("inspect", {}).get("jobs", {})
         for subsrc in jobs:
             if jobs[subsrc].get("status") == "inspecting":
                 logging.warning(
                     "Found stale datasource '%s', marking inspect status as 'canceled'"
                     % src["_id"])
                 jobs[subsrc]["status"] = "canceled"
         src_dump.replace_one({"_id": src["_id"]}, src)
Example 7
 def __init__(self, source_list, dump_manager, upload_manager,
              data_plugin_manager):
     self._orig_source_list = source_list
     self.source_list = None
     self.dump_manager = dump_manager
     self.upload_manager = upload_manager
     self.data_plugin_manager = data_plugin_manager
     self.reload()
     self.src_master = get_src_master()
     self.src_dump = get_src_dump()
      # honoring BaseSourceManager interface (gloups...)
     self.register = {}
Example 8
 def create_todump_list(self, force=False):
     uni_doc = get_src_dump().find_one({"_id":UniprotDumper.SRC_NAME}) or {}
     if uni_doc:
         remotefile = os.path.join(uni_doc["download"]["data_folder"],self.__class__.UNIPROT_FILE)
         if not os.path.exists(remotefile):
             self.logger.warning("File '%s' doesn't exist (yet?)" % self.__class__.UNIPROT_FILE)
             return
         self.release = uni_doc["download"]["release"]
         localfile = os.path.join(self.current_data_folder,self.__class__.UNIPROT_FILE)
         if force or not os.path.exists(localfile) or self.remote_is_better(remotefile,localfile):
             self.to_dump.append({"remote":remotefile,"local":localfile})
     else:
         self.logger.error("Dependent uniprot datasource has not been loaded (not src_dump doc)")
Example 9
    def load(self, aslist=False):
        '''
        Load NCBI's "homologene.data" file,
        adding a "homologene" field to each gene doc.
        '''
        from biothings.utils.hub_db import get_src_dump
        homo_d = tab2dict(self.datafile,(2,1),0,header=0)
        entrez_doc = get_src_dump().find_one({"_id":"entrez"}) or {}
        entrez_dir = entrez_doc.get("data_folder")
        assert entrez_dir, "Can't find Entrez data directory"
        DATAFILE = os.path.join(entrez_dir, 'gene_history.gz')
        assert os.path.exists(DATAFILE), "gene_history.gz is missing (entrez_dir: %s)" % entrez_dir
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,includefn=lambda ld: ld[1] != '-')
        for id in list(homo_d.keys()):
            homo_d[retired2gene.get(id,id)] = homo_d[id]

        with open(self.datafile) as df:
            homologene_d = {}
            doc_li = []
            print()
            geneid_d = get_geneid_d(entrez_dir, self.species_li,load_cache=False,save_cache=False,only_for=homo_d)

            for line in df:
                ld = line.strip().split('\t')
                hm_id, tax_id, geneid = [int(x) for x in ld[:3]]
                if (self.taxid_set is None or tax_id in self.taxid_set) and geneid in geneid_d:
                    # for selected species only,
                    # and also ignore those whose geneid does not match any
                    # existing gene doc.
                    # In case the original geneid is retired, it is replaced
                    # with the new one, if available.
                    geneid = geneid_d[geneid]
                    genes = homologene_d.get(hm_id, [])
                    genes.append((tax_id, geneid))
                    homologene_d[hm_id] = genes

                    doc_li.append(dict(_id=str(geneid), taxid=tax_id,
                                       homologene={'id': hm_id}))

            for i, gdoc in enumerate(doc_li):
                gdoc['homologene']['genes'] = self._sorted_homologenes(
                    set(homologene_d[gdoc['homologene']['id']]))
                doc_li[i] = gdoc

        if aslist:
            return doc_li
        else:
            gene_d = dict([(d['_id'], d) for d in doc_li])
            return gene_d
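
The retired-ID step above re-keys homo_d entries whose gene ID appears in gene_history.gz, without removing the retired key itself. A toy, self-contained illustration with made-up IDs and values:

homo_d = {"101": ["a"], "202": ["b"]}     # made-up gene IDs and values
retired2gene = {"101": "111"}             # 101 was retired, replaced by 111
for gid in list(homo_d.keys()):
    homo_d[retired2gene.get(gid, gid)] = homo_d[gid]
assert homo_d["111"] == ["a"]
assert "101" in homo_d                    # the retired key is kept as well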
Example 10
    def create_todump_list(self, force=False):
        self.from_src = get_src_dump().find_one(
            {"_id": self.__class__.FROM_SOURCE["name"]})
        self.to_src = get_src_dump().find_one(
            {"_id": self.__class__.TO_SOURCE["name"]})
        from_folder = self.from_src.get("download", {}).get("data_folder")
        to_folder = self.to_src.get("download", {}).get("data_folder")
        assert from_folder, "Couldn't find folder for source %s (tried '%s')" % (
            self.from_src, from_folder)
        assert to_folder, "Couldn't find folder for source %s (tried '%s')" % (
            self.to_src, to_folder)

        self.set_release()  # so we can generate new_data_folder
        for attr, folder in [("FROM_SOURCE", from_folder),
                             ("TO_SOURCE", to_folder)]:
            files = getattr(self.__class__, attr, {}).get("files")
            assert files, "No files specified in %s" % attr
            for one_file in files:
                remote_file = os.path.join(folder, one_file)
                assert os.path.exists(
                    remote_file), "Remote file '%s' doesn't exist in %s" % (
                        remote_file, attr)
                new_localfile = os.path.join(self.new_data_folder, one_file)
                current_localfile = os.path.join(self.current_data_folder,
                                                 one_file)
                try:
                    remote_better = self.remote_is_better(
                        remote_file, current_localfile)
                except FileNotFoundError:
                    # no local file, we want the remote
                    remote_better = True
                if force or current_localfile is None or remote_better:
                    self.to_dump.append({
                        "remote": remote_file,
                        "local": new_localfile
                    })
Example 11
 def clean_stale_status(self):
     src_dump = get_src_dump()
     srcs = src_dump.find()
     for src in srcs:
         jobs = src.get("upload", {}).get("jobs", {})
         dirty = False
         for subsrc in jobs:
             if jobs[subsrc].get("status") == "uploading":
                 logging.warning(
                     "Found stale datasource '%s', marking upload status as 'canceled'"
                     % src["_id"])
                 jobs[subsrc]["status"] = "canceled"
                 dirty = True
         if dirty:
             src_dump.replace_one({"_id": src["_id"]}, src)
Example 12
    def load_data(self, data_folder):
        """
        Loads gene data from NCBI's refseq2gene.gz file.
        Parses it based on genomic position data and refseq status provided by the
        list of taxids from get_ref_microbe_taxids() as lookup table
        :return:
        """

        refsrc = get_src_dump().find_one({"_id":"ref_microbe_taxids"})
        assert refsrc, "ref_microbe_taxids dump not found"
        taxids_file = os.path.join(refsrc["download"]["data_folder"], "ref_microbe_taxids.pyobj")
        datafile = os.path.join(data_folder, 'gene2refseq.gz')

        taxids = loadobj(taxids_file)
        taxid_set = set(taxids)

        def _includefn(ld):
            return ld[0] in taxid_set  # match taxid from taxid_set

        cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
        gene2genomic_pos_li = tab2list(datafile, cols_included, header=1,
                                       includefn=_includefn)
        count = 0
        last_id = None
        for gene in gene2genomic_pos_li:
            count += 1
            strand = 1 if gene[5] == '+' else -1
            _id = gene[1]

            mgi_dict = {
                '_id': _id,
                'genomic_pos': {
                    'entrezgene': _id,
                    'start': int(gene[3]),
                    'end': int(gene[4]),
                    'chr': gene[2],
                    'strand': strand
                }
            }
            if _id != last_id:
                # consecutive rows with a duplicate _id will be skipped
                yield mgi_dict
            last_id = _id
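
Because only last_id is tracked, the generator above skips a duplicate only when it immediately follows a row with the same _id, which assumes gene2refseq.gz is grouped by gene ID. A stripped-down, self-contained version of that de-duplication with made-up rows:

rows = [("g1", "+"), ("g1", "+"), ("g2", "-")]   # made-up (gene id, orientation) rows

def dedup(rows):
    last_id = None
    for _id, strand in rows:
        if _id != last_id:                       # consecutive duplicates are dropped
            yield {"_id": _id, "strand": 1 if strand == "+" else -1}
        last_id = _id

assert list(dedup(rows)) == [{"_id": "g1", "strand": 1},
                             {"_id": "g2", "strand": -1}]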
Example 13
 def source_info(self, source=None):
     src_dump = get_src_dump()
     src_ids = list(self.register.keys())
     if source:
         if source in src_ids:
             src_ids = [source]
         else:
             return None
     res = []
     cur = src_dump.find({"_id": {"$in": src_ids}})
     bysrcs = {}
     [bysrcs.setdefault(src["_id"], src) for src in cur]
     for _id in src_ids:
         src = bysrcs.get(_id, {})
         uploaders = self.register[_id]
         src.setdefault("upload", {})
         for uploader in uploaders:
             upl = {
                 "name":
                 "%s.%s" %
                 (inspect.getmodule(uploader).__name__, uploader.__name__),
                 "bases": [
                     "%s.%s" % (inspect.getmodule(k).__name__, k.__name__)
                     for k in uploader.__bases__
                 ],
                 "dummy":
                 issubclass(uploader, DummySourceUploader),
             }
             src["upload"].setdefault("jobs",
                                      {}).setdefault(uploader.name, {})
             src["upload"]["jobs"][uploader.name]["uploader"] = upl
         src["name"] = _id
         src["_id"] = _id
         res.append(src)
     if source:
         if res:
             return res.pop()
         else:
             # no information, just return what was passed to honor return type
             # + minimal information
             return {"name": source, "_id": source}
     else:
         return res
Example 14
    def export_mapping(self, plugin_name, folder):
        res = {
            "mapping": {
                "status": None,
                "file": None,
                "message": None,
                "origin": None
            }
        }
        # first try to export mapping from src_master (official)
        doc = get_src_master().find_one({"_id": plugin_name})
        if doc:
            mapping = doc.get("mapping")
            res["mapping"]["origin"] = "registered"
        else:
            doc = get_src_dump().find_one({"_id": plugin_name})
            mapping = doc and doc.get("inspect",{}).get("jobs",{}).get(plugin_name,{}).get("inspect",{}).\
                          get("results",{}).get("mapping")
            res["mapping"]["origin"] = "inspection"
        if not mapping:
            res["mapping"]["origin"] = None
            res["mapping"]["status"] = "warning"
            res["mapping"][
                "message"] = "Can't find registered or generated (inspection) mapping"
            return res
        else:
            ufile = os.path.join(folder, "upload.py")
            strmap, _ = yapf_api.FormatCode(pprint.pformat(mapping))
            with open(ufile, "a") as fout:
                fout.write("""
    @classmethod
    def get_mapping(klass):
        return %s\n""" % textwrap.indent((strmap), prefix="    " * 2))

        res["mapping"]["file"] = ufile
        res["mapping"]["status"] = "ok"

        return res
Example 15
 def prepare_src_dump(self):
     # Mongo side
     self.src_dump = get_src_dump()
     self.src_doc = self.src_dump.find_one({'_id': self.src_name}) or {}
Example 16
def set_pending_to_upload(src_name):
    src_dump = get_src_dump()
    src_dump.update({"_id": src_name}, {"$addToSet": {"pending": "upload"}})
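
The $addToSet update above flags the source as pending an upload without ever duplicating the flag. A plain-Python sketch of that behavior on a made-up src_dump document (the real call goes through the hub_db collection API):

doc = {"_id": "my_source"}                        # hypothetical src_dump document
for _ in range(2):                                # second pass is a no-op
    if "upload" not in doc.setdefault("pending", []):
        doc["pending"].append("upload")
assert doc["pending"] == ["upload"]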
Example 17
def migrate_0dot1_to_0dot2():
    """
    mongodb src_dump/data_plugin changed:
        1. "data_folder" and "release" under "download"
        2. "data_folder" and "release" in upload.jobs[subsrc] taken from "download"
        3. no more "err" under "upload"
        4. no more "status" under "upload"
        5. "pending_to_upload" is now "pending": ["upload"]
    """
    src_dump = get_src_dump()
    data_plugin = get_data_plugin()
    for srccol in [src_dump, data_plugin]:
        logging.info("Converting collection %s" % srccol)
        srcs = [src for src in srccol.find()]
        wasdue = False
        for src in srcs:
            logging.info("\tConverting '%s'" % src["_id"])
            # 1.
            for field in ["data_folder", "release"]:
                if field in src:
                    logging.debug(
                        "\t\t%s: found '%s' in document, moving under 'download'"
                        % (src["_id"], field))
                    try:
                        src["download"][field] = src.pop(field)
                        wasdue = True
                    except KeyError as e:
                        logging.warning(
                            "\t\t%s: no such field '%s' found, skip it (error: %s)"
                            % (src["_id"], field, e))
            # 2.
            for subsrc_name in src.get("upload", {}).get("jobs", {}):
                for field in ["data_folder", "release"]:
                    if field not in src["upload"]["jobs"][subsrc_name]:
                        logging.debug(
                            "\t\t%s: no '%s' found in upload jobs, taking it from 'download' (or from root keys)"
                            % (src["_id"], field))
                        try:
                            src["upload"]["jobs"][subsrc_name][field] = src[
                                "download"][field]
                            wasdue = True
                        except KeyError:
                            try:
                                src["upload"]["jobs"][subsrc_name][
                                    field] = src[field]
                                wasdue = True
                            except KeyError:
                                logging.warning(
                                    "\t\t%s: no such field '%s' found, skip it"
                                    % (src["_id"], field))
            # 3. & 4.
            for field in ["err", "status"]:
                if field in src.get("upload", {}):
                    logging.debug("\t\t%s: removing '%s' key from 'upload'" %
                                  (src["_id"], field))
                    src["upload"].pop(field)
                    wasdue = True
            # 5.
            if "pending_to_upload" in src:
                logging.debug(
                    "\t%s: found 'pending_to_upload' field, moving to 'pending' list"
                    % src["_id"])
                src.pop("pending_to_upload")
                wasdue = True
                if "upload" not in src.get("pending", []):
                    src.setdefault("pending", []).append("upload")
            if wasdue:
                logging.info("\tFinishing converting document for '%s'" %
                             src["_id"])
                srccol.save(src)
            else:
                logging.info("\tDocument for '%s' already converted" %
                             src["_id"])
Example 18
 def prepare_src_dump(self):
     """Sync with src_dump collection, collection information (src_doc)
     Return src_dump collection"""
     src_dump = get_src_dump()
     self.src_doc = src_dump.find_one({'_id': self.main_source}) or {}
     return src_dump
Example 19
    def inspect(self,
                data_provider,
                mode="type",
                batch_size=10000,
                limit=None,
                sample=None,
                **kwargs):
        """
        Inspect given data provider:
        - backend definition, see bt.hub.dababuild.create_backend for
          supported format), eg "merged_collection" or ("src","clinvar")
        - or callable yielding documents
        Mode:
        - "type": will inspect and report type map found in data (internal/non-standard format)
        - "mapping": will inspect and return a map compatible for later
          ElasticSearch mapping generation (see bt.utils.es.generate_es_mapping)
        - "stats": will inspect and report types + different counts found in
          data, giving a detailed overview of the volumetry of each fields and sub-fields
        - "jsonschema", same as "type" but result is formatted as json-schema standard
        - limit: when set to an integer, will inspect only x documents.
        - sample: combined with limit, for each document, if random.random() <= sample (float),
          the document is inspected. This option allows to inspect only a sample of data.
        """
        # /!\ attention: this piece of code is critical and not easy to understand...
        # Depending on the source of data to inspect, this method will create an
        # uploader or a builder. These objects don't behave the same when they
        # pass through pickle: an uploader needs to be unprepare()'d so it can be
        # pickled (removing some db connections, sockets), while a builder must *not* be
        # unprepare()'d because that would reset the underlying target_name (the actual
        # target collection). Also, the way results and statuses are registered is
        # different for uploaders and builders...
        # So, there are lots of "if"s; be careful if you want to modify this code.

        data_provider_type = None  # where to register results (if possible to do so)
        registerer_obj = None  # who should register result
        t0 = time.time()
        started_at = datetime.now().astimezone()
        self.logger.info("Inspecting data with mode %s and data_provider %s" %
                         (repr(mode), repr(data_provider)))
        if callable(data_provider):
            raise NotImplementedError("data_provider as callable untested...")
        else:
            if data_provider[0] == "src":
                data_provider_type = "source"
                # find src_dump doc
                # is it a full source name (dot notation) ?
                fullname = get_source_fullname(data_provider[1])
                if fullname:
                    # it's a dot-notation
                    src_name = fullname.split(".")[0]
                else:
                    # no subsource, full source name is the passed name
                    src_name = data_provider[1]
                    fullname = src_name
                doc = get_src_dump().find_one({"_id": src_name
                                               })  # query by main source
                if not doc:
                    raise InspectorError(
                        "Can't find document associated to '%s'" % src_name)
                # get an uploader instance (used to get the data if type is "uploader",
                # but also to update the status of the datasource via register_status())
                ups = self.upload_manager[
                    fullname]  # potentially using dot notation
                if len(ups) > 1:
                    # recursively call inspect(), collect and return corresponding tasks
                    self.logger.debug(
                        "Multiple uploaders found, running inspector for each of them: %s"
                        % ups)
                    res = []
                    for up in ups:
                        r = self.inspect((data_provider[0], "%s" % up.name),
                                         mode=mode,
                                         batch_size=batch_size,
                                         limit=limit,
                                         sample=sample,
                                         **kwargs)
                        res.append(r)
                    return res

                assert len(
                    ups
                ) == 1, "More than one uploader found for '%s', not supported (yet), use main_source.source notation" % data_provider[
                    1]
                # create uploader
                registerer_obj = self.upload_manager.create_instance(ups[0])
                backend_provider = data_provider
            else:
                try:
                    data_provider_type = "build"
                    registerer_obj = self.build_manager.get_builder(
                        data_provider)
                    backend_provider = data_provider
                except Exception as e:
                    raise InspectorError(
                        "Unable to create backend from '%s': %s" %
                        (repr(data_provider), e))

        got_error = None
        try:

            @asyncio.coroutine
            def do():
                yield from asyncio.sleep(0.0)
                nonlocal mode

                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register begin of inspection (differs slightly depending on type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info(
                    "Running inspector on %s (type:%s,data_provider:%s)" %
                    (repr(data_provider), data_provider_type,
                     backend_provider))
                if sample is not None:
                    self.logger.info(
                        "Sample set to %s, inspect only a subset of data",
                        sample)
                if limit is None:
                    self.logger.info("Inspecting all the documents")
                else:
                    nonlocal batch_size
                    # adjust batch_size so we inspect only "limit" docs if the batch is bigger than the limit
                    if batch_size > limit:
                        batch_size = limit
                    self.logger.info("Inspecting only %s documents", limit)
                # make it pickleable
                if data_provider_type == "source":
                    # because registerer_obj is also used to fetch data, it has to be unprepare()'d for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, we'll lose the target name
                    # (it would be randomly generated again) and we won't be able to register results
                    pass

                cnt = 0
                doccnt = 0
                jobs = []
                # normalize mode param and prepare global results
                if type(mode) == str:
                    mode = [mode]

                converters, mode = btinspect.get_converters(mode)

                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    if sample is not None:
                        if random.random() > sample:
                            continue
                    cnt += 1
                    doccnt += batch_size
                    if limit and doccnt > limit:
                        break
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping is computed only once the ES mapping is generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                # finally, run potential converters (if any)
                btinspect.run_converters(inspected, converters)

                def fully_inspected(res):
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)

                        # when inspecting with "stats" mode, we can get huge numbers, but mongo
                        # can't store more than 2^64; make sure to get rid of big nums there
                        def clean_big_nums(k, v):
                            # TODO: same with float/double? mongo seems to handle larger values there
                            if isinstance(v, int) and v > 2**64:
                                return k, math.nan
                            else:
                                return k, v

                        dict_traverse(_map, clean_big_nums)
                        # register end of inspection (differs slightly depending on type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error

            task = asyncio.ensure_future(do())
            return task
        except Exception as e:
            self.logger.error("Error while inspecting '%s': %s" %
                              (repr(data_provider), e))
            raise
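
clean_big_nums() above protects against integers the code treats as too large for MongoDB (anything above 2**64). The real implementation walks the result dict with biothings' dict_traverse(); the self-contained sketch below substitutes a simplified recursive walk of nested dicts just to show the effect, with made-up inspection stats:

import math

def clean_big_nums(k, v):
    # replace integers above 2**64 with NaN, as in the inspector code above
    if isinstance(v, int) and v > 2**64:
        return k, math.nan
    return k, v

def traverse(d, func):
    # simplified stand-in for dict_traverse(): apply func(k, v) to every leaf
    for k in list(d):
        if isinstance(d[k], dict):
            traverse(d[k], func)
        else:
            new_k, new_v = func(k, d[k])
            del d[k]
            d[new_k] = new_v

stats = {"results": {"count": 10, "sum": 2**70}}   # made-up inspection stats
traverse(stats, clean_big_nums)
assert stats["results"]["count"] == 10 and math.isnan(stats["results"]["sum"])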
Example 20
 def poll(self, state, func):
     super(UploaderManager, self).poll(state, func, col=get_src_dump())