def scan_files(project):
    """Build the variants database for one project and forward the project.

    The database is built in a temporary file: genes and pathways are loaded
    first, then every entry of ``project["storage_objects"]`` is fetched into
    ``<project path>/sources`` and each file yielded by ``archived_files`` for
    it is parsed and stored. Once everything has been committed, the
    temporary database is copied to ``project["db"]`` (a database already
    present there is deleted up front) and the project is sent to the output
    port matching its assembly: ``liftover_projects`` for ``hg18``,
    ``projects_out`` for ``hg19``.

    The database connection is closed and the temporary file removed whether
    or not the import succeeds.

    Uses the module-level ``task`` for logging, configuration, ports and
    storage access.

    Args:
        project: Mapping read for the keys ``id``, ``path``, ``db``,
            ``assembly`` and ``storage_objects``.

    Raises:
        Exception: If the assembly is neither ``hg18`` nor ``hg19``, or if a
            source file yields no variants.
    """
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

    project_id = project["id"]
    project_path = project["path"]
    projdb_path = project["db"]
    assembly = project["assembly"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    if assembly == "hg18":
        out_port = liftover_projects_port
    elif assembly == "hg19":
        out_port = projects_port
    else:
        raise Exception("Unexpected assembly: {0}".format(assembly))

    # An existing database is never reused: it is discarded here and rebuilt
    # from the source files below.
    if os.path.exists(projdb_path):
        os.remove(projdb_path)

    log.info("Creating variants database ...")

    projdb_tmp_path = make_temp_file(task, suffix=".db")
    log.debug(projdb_tmp_path)

    try:
        projdb = ProjectDb(projdb_tmp_path).create()
        try:
            log.info("Loading genes ...")
            projdb.load_genes(paths.data_ensembl_genes_path())

            log.info("Loading pathways ...")
            projdb.load_pathways(
                paths.data_kegg_def_path(),
                paths.data_kegg_ensg_map_path())

            log.info("Parsing variants ...")

            for obj_name in project["storage_objects"]:
                log.info("Downloading {} ...".format(obj_name))

                dst_path = os.path.join(project_path, "sources", os.path.basename(obj_name))
                dst_dirname = os.path.dirname(dst_path)
                if not os.path.exists(dst_dirname):
                    os.makedirs(dst_dirname)

                # TODO: do not copy the source file (do not specify dst_path)
                task.storage.get_object(obj_name).get_data(dst_path)

                for container_name, path, name, ext, f in archived_files(dst_path):
                    fname = os.path.join(path, name + ext)
                    if container_name is not None:
                        source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
                    else:
                        source_name = name + ext

                    log.info("=> {0} ...".format(source_name))

                    sample_id = os.path.basename(name)

                    # The extension is matched case-insensitively, so the
                    # parser type derived from it is lower-cased as well;
                    # otherwise an upper-case extension would pass the check
                    # but produce an upper-case parser type.
                    ext_lower = ext.lower()
                    if ext_lower in _SUPPORTED_EXTENSIONS:
                        parser_type = ext_lower[1:]
                    else:
                        parser_type = "tab"

                    parser = create_variants_parser(parser_type, f, source_name, sample_id)

                    source_id = projdb.add_source(source_name)

                    num_variants = _store_variants(projdb, parser, source_id)
                    log.info(" {0} variants".format(num_variants))

                    if num_variants == 0:
                        raise Exception("No variants found in source '{}'. "
                            "Please check the documentation for the expected input for '{}' format.".format(
                            source_name, parser.name))

            projdb.commit()
        finally:
            # Release the database even when loading or parsing fails.
            projdb.close()

        log.info("Copying variants database ...")
        log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

        shutil.copy(projdb_tmp_path, projdb_path)
    finally:
        # Do not leave the temporary database behind, on success or failure.
        remove_temp(task, projdb_tmp_path)

    out_port.send(project)


def _store_source_lines(projdb, parser, source_id):
    """Store each (line number, text) pair from ``parser.read_lines()`` under *source_id*."""
    for line_num, text in parser.read_lines():
        projdb.add_source_line(source_id, line_num, text)


def _store_variants(projdb, parser, source_id):
    """Store every variant yielded by *parser* and return the number of distinct variant ids."""
    var_ids = set()
    for var in parser:
        # Source lines are stored before the variant that follows them.
        # NOTE(review): presumably read_lines() returns only the lines read
        # since the previous call -- confirm against the parser.
        _store_source_lines(projdb, parser, source_id)
        var_ids.add(projdb.add_variant(
            var, source_id=source_id, line_num=parser.get_line_num()))

    # Store whatever read_lines() still returns once iteration has finished.
    _store_source_lines(projdb, parser, source_id)
    return len(var_ids)