Example No. 1
def main_pipeline_part_3(session: database.Session, sample, db_sequence_id):
    file_path = get_local_folder_for(virus.name, FileType.Annotations) + str(
        sample.internal_id()) + ".pickle"
    try:
        if not os.path.exists(file_path):
            annotations_and_nuc_variants = sequence_aligner(
                sample.internal_id(), reference_sequence,
                sample.nucleotide_sequence(), sc2_chromosome,
                sc2_annotations_file_path, sc2_snpeff_db_name)
            with open(file_path, mode='wb') as cache_file:
                pickle.dump(annotations_and_nuc_variants,
                            cache_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
        else:
            with open(file_path, mode='rb') as cache_file:
                annotations_and_nuc_variants = pickle.load(cache_file)
        annotations, nuc_variants = annotations_and_nuc_variants
        for ann in annotations:
            vcm.create_annotation_and_amino_acid_variants(
                session, db_sequence_id, *ann)
        for nuc in nuc_variants:
            vcm.create_nuc_variants_and_impacts(session, db_sequence_id, nuc)
        stats_module.completed_sample(sample.primary_accession_number())
    except Exception as e:
        if str(e).endswith("sequence contains letters not in the alphabet"):
            logger.warning(
                f"sample {sample.primary_accession_number()} skipped because sequence contains letters not in "
                f"the alphabet")
        else:
            logger.exception(
                f'exception occurred during pipeline_part_3 of sample {sample.primary_accession_number()}. '
                f'Doing rollback of insertion of variants and annotations + deletion of cache'
            )
        remove_file(file_path)
        raise
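
The function above follows a compute-or-load pickle-cache pattern: the expensive alignment result is stored on disk and reused on later runs. A minimal generic sketch of the same idea (cached_compute, cache_path and compute_fn are placeholder names, not part of the original code):

import os
import pickle

def cached_compute(cache_path, compute_fn, *args):
    # reuse a previously pickled result if present, otherwise compute and store it
    if os.path.exists(cache_path):
        with open(cache_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    result = compute_fn(*args)
    with open(cache_path, mode='wb') as cache_file:
        pickle.dump(result, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
    return result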
Example No. 2
 def __init__(self):
     logger.info(
         f'importing virus {self.name} using NCBI SC2 for taxonomy data')
     # fetch taxonomy data from NCBI
     taxonomy_file_path = download_ncbi_taxonomy_as_xml(
         get_local_folder_for(source_name=self.name,
                              _type=FileType.TaxonomyData), self.taxon_id())
     try:
         self.tax_tree = etree.parse(
             taxonomy_file_path,
             parser=etree.XMLParser(remove_blank_text=True))
     except etree.XMLSyntaxError as e:  # happens on AWS if for some reason the downloaded file is corrupted
         remove_file(taxonomy_file_path)
         ncbi_sc2_taxonomy_dir = get_local_folder_for(
             source_name=ncbi_known_settings["sars_cov_2"]
             ["generated_dir_name"],
             _type=FileType.TaxonomyData)
         alternative_taxonomy_path = ncbi_sc2_taxonomy_dir + f"{ncbi_known_settings['sars_cov_2']['virus_taxon_id']}.xml"
         if os.path.exists(alternative_taxonomy_path):
             shutil.copyfile(alternative_taxonomy_path, taxonomy_file_path)
             self.tax_tree = etree.parse(
                 taxonomy_file_path,
                 parser=etree.XMLParser(remove_blank_text=True))
         else:
             logger.error(
                 f"Taxonomy file of SARS-CoV-2 was corrupted or empty. Attempt to use the one from {ncbi_sc2_taxonomy_dir} "
                 f"failed because the file doesn't exist. Can't proceed.")
             raise e
     # fetch latest source data
     download_dir = get_local_folder_for(
         source_name=self.name, _type=FileType.SequenceOrSampleData)
     self.sequence_file_path, self.metadata_file_path = download_or_get_sample_data(
         download_dir)
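
The constructor's error handling isolates a parse-and-retry pattern: parse the downloaded XML and, if it turns out corrupted, restore a known-good copy and parse again. A self-contained sketch of just that pattern (the function name and paths are placeholders; lxml is assumed installed):

import shutil
from lxml import etree

def parse_with_fallback(xml_path, fallback_path):
    parser = etree.XMLParser(remove_blank_text=True)
    try:
        return etree.parse(xml_path, parser=parser)
    except etree.XMLSyntaxError:
        # the downloaded file is corrupted: restore the fallback copy and retry once
        shutil.copyfile(fallback_path, xml_path)
        return etree.parse(xml_path, parser=parser)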
Example No. 3
def download_or_get_sample_data(containing_directory: str) -> Tuple[str, str]:
    """
    :return: the local file paths of the downloaded sequence and metadata files.
    """
    def get_download_size(url) -> Optional[int]:
        """
        Returns the size of the downloadable resource in bytes supported by the protocol
        of the downloadable resource; None otherwise.
        """
        req = Request(url=url, method='HEAD')
        f = urlopen(req)
        if f.status == 200 and f.headers['Content-Length'] is not None:
            return int(f.headers['Content-Length'])
        else:
            return None

    def get_local_file_size(path) -> Optional[int]:
        """
        Returns the size of the local file in bytes if it exists; None otherwise.
        """
        if os.path.exists(path):
            return os.stat(path).st_size
        else:
            return None

    def download_coguk_data():
        # make sure the output path does not exist already, or wget assigns a trailing number to it
        logger.info(
            f'downloading sample sequences for COG-UK data from {COGUKSarsCov2.sequence_file_url} ...'
        )
        wget.download(COGUKSarsCov2.sequence_file_url,
                      sequence_local_file_path)
        logger.info(
            f'\ndownloading sample metadata for COG-UK data from {COGUKSarsCov2.metadata_file_url} ...'
        )
        wget.download(COGUKSarsCov2.metadata_file_url,
                      metadata_local_file_path)
        logger.info('\n')

    sequence_local_file_path = containing_directory + COGUKSarsCov2.sequence_file_url.rsplit(
        '/', maxsplit=1)[1]
    metadata_local_file_path = containing_directory + COGUKSarsCov2.metadata_file_url.rsplit(
        '/', maxsplit=1)[1]
    if not os.path.exists(sequence_local_file_path) or not os.path.exists(
            metadata_local_file_path):
        download_coguk_data()
    else:
        # compare local file sizes with the remote ones
        if get_download_size(COGUKSarsCov2.sequence_file_url) != get_local_file_size(sequence_local_file_path) or \
                get_download_size(COGUKSarsCov2.metadata_file_url) != get_local_file_size(metadata_local_file_path):
            remove_file(sequence_local_file_path)
            remove_file(metadata_local_file_path)
            download_coguk_data()
    return sequence_local_file_path, metadata_local_file_path
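
The remote-vs-local size comparison above serves as a lightweight freshness check. The same idea as a standalone helper using only the standard library (needs_download is a hypothetical name; unlike the code above, it also guards against a missing Content-Length header):

import os
from urllib.request import Request, urlopen

def needs_download(url: str, local_path: str) -> bool:
    # re-download when the local file is missing or its size differs from the remote one
    if not os.path.exists(local_path):
        return True
    with urlopen(Request(url=url, method='HEAD')) as response:
        content_length = response.headers.get('Content-Length')
    if content_length is None:
        return True  # server did not report a size; be conservative
    return int(content_length) != os.stat(local_path).st_size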
Example No. 4
 def coverage(self) -> Optional[int]:
     value = self._find_in_attributes('coverage')[1]
     if value is not None:
         try:
             # noinspection PyTypeChecker
             return round(float(value))
         except ValueError:
             remove_file(self.file_path)
             logger.error(
                 f'Error while parsing coverage string {value} from host XML {self.acc_id}. File removed.'
             )
             return None
     return None
Example No. 5
 def submission_date(self) -> Optional[datetime]:
     date = self._find_in_attributes('receipt date')[1]
     if date:
         try:
             return datetime.strptime(date, '%Y-%m-%d')
         except ValueError as e:
             remove_file(self.file_path)
             logger.error(f"XML host file {self.acc_id} removed. {e.args}")
             return None
         except TypeError:
             return None
     else:
         return None
Example No. 6
    def _find_in_attributes(self,
                            keyword,
                            exclude_keyword: Optional[str] = None):
        key = [k for k in self.attributes if keyword in k]
        if exclude_keyword is not None:
            key = [k for k in key if exclude_keyword not in k]

        if len(key) == 1:
            value = self.attributes[key[0]].lower()
            return key[0], value
        elif len(key) > 1:
            remove_file(self.file_path)
            raise AssertionError(
                f'NCBI XML host {self.acc_id} contains more than one attribute matching {key}. XML host file removed.'
            )
        else:
            return None, None
Example No. 7
    def __init__(self, host_sample_accession_id: str, download_dir: str):
        self.acc_id = host_sample_accession_id
        self.file_path = download_or_get_ncbi_host_sample_as_xml(
            download_dir, self.acc_id)
        self.host_xml: lxml.etree.ElementTree = etree.parse(
            source=self.file_path,
            parser=etree.XMLParser(remove_blank_text=True))

        attribute_nodes_cleaned = []
        attribute_nodes = self.host_xml.xpath('.//Attributes/Attribute')
        # drop attributes whose value is a placeholder
        for node in attribute_nodes:
            value = node.text.lower()
            # skip values like "not applicable"/"not collected"/"restricted access"/"missing"
            if 'not' not in value and 'missing' not in value and 'restricted' not in value:
                attribute_nodes_cleaned.append(node)
        # save attributes as key-value pairs
        self.attributes = {}
        for node in attribute_nodes_cleaned:
            if 'attribute_name' in node.attrib:
                key = node.attrib['attribute_name'].lower()
            else:
                remove_file(self.file_path)
                raise ValueError(
                    f'NCBI XML host {self.acc_id} is not encoded in any known format. XML host file removed.'
                )
            value = node.text
            # insert the key-value pair; on a key collision, fall back to a more specific key name
            if key in self.attributes.keys():
                if 'harmonized_name' in node.attrib:
                    key = node.attrib['harmonized_name'].lower()
                    if key in self.attributes.keys():
                        if 'display_name' in node.attrib:
                            key = node.attrib['display_name'].lower()
                            if key in self.attributes.keys():
                                remove_file(self.file_path)
                                raise ValueError(
                                    f'NCBI XML host {self.acc_id} is not encoded in any known format. XML host file removed.'
                                )
            self.attributes[key] = value
Example No. 8
 def extract_epitopes_data():
      # unzip and re-gzip as required by the VirusGenoUtil library
     logger.info(
         "transforming downloaded files as required by VirusGenoUtil...")
     io_list = zip(
         (download_tcell_local_file_path, download_bcell_local_file_path,
          download_mhc_ligand_local_file_path),
         (tcell_file_name, bcell_file_name, mhc_ligand_file_name),
         (final_tcell_local_file_path, final_bcell_local_file_path,
          final_mhc_ligand_local_file_path))
     for downloaded_file_path, file_name, output_file_path in tqdm(io_list):
         inner_file_name = file_name + ".csv"
         # unzip downloaded file into inner_file_name
         with ZipFile(file=downloaded_file_path, mode='r') as zipped_file:
             zipped_file.extract(member=inner_file_name, path=download_dir)
         # gzip extracted file
         with open(file=download_dir + inner_file_name,
                   mode="rb") as inner_file:
             with gzip.open(output_file_path, mode='wb') as output_file:
                 shutil.copyfileobj(inner_file, output_file)
         # remove inner file as it is only an intermediate product
         remove_file(download_dir + inner_file_name)
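
The extract-then-gzip sequence above writes an intermediate file only to delete it afterwards. When the plain-CSV copy is not needed, the zip member can be streamed straight into the gzip output; a sketch reusing the names from the example (ZipFile.open yields a readable binary file object):

import gzip
import shutil
from zipfile import ZipFile

with ZipFile(file=downloaded_file_path, mode='r') as zipped_file, \
        zipped_file.open(inner_file_name) as inner_file, \
        gzip.open(output_file_path, mode='wb') as output_file:
    shutil.copyfileobj(inner_file, output_file)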
Example No. 9
 def update_source_data(self):
     logger.info("Downloading updates from source...")
     # read user credentials
     if not os.path.exists(self.credentials_path):
         with open(self.credentials_path, mode='w') as credentials_file:
             credentials_file.write("# Lines starting with # are comments.\n"
                                    "# Write in the following line <username>,<password> to use for downloading "
                                    "updated sequence data from GISAID.")
         raise AssertionError(f"No GISAID credentials provided. Please update the file at path {self.credentials_path}")
     with open(self.credentials_path, mode='r') as credentials_file:
         username, psw = None, None
         for line in credentials_file.readlines():
             if line.startswith("#"):
                 continue
             try:
                 username, psw = line.split(",")
                 username = username.strip()
                 psw = psw.strip()
             except Exception as e:
                 logger.error(f"Error encountered while parsing GISAID credentials file at path {self.credentials_path}")
                 raise e
         if not username or not psw:
             raise AssertionError(f"No GISAID credentials provided. Please update the file at path {self.credentials_path}")
     # download updated data from source
     download_path = get_local_folder_for(self.name, FileType.SequenceOrSampleData)
     download_path += "export_" + date.today().strftime("%Y-%b-%d") + ".json.bz2"
     remove_file(download_path)
     remove_file(self.data_path)
     os.system(f"wget --user {username} --password {psw} -O {download_path} https://www.epicov.org/epi3/3p/virusurf/export/export.json.bz2")
     if not exists(download_path):
         raise ValueError("download of https://www.epicov.org/epi3/3p/virusurf/export/export.json.bz2 with username "
                          f"'{username}' and password '{psw}' failed.")
     # extract archive to self.data_path
     with bz2.open(filename=download_path, mode='rt') as compressed_file:
         with open(file=self.data_path, mode="w") as decompressed_file:
             for line in compressed_file:
                 decompressed_file.write(line)
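
Since the loop above does no per-line processing, the archive can also be decompressed as a binary stream copy (byte-exact, no newline translation); a sketch reusing download_path and self.data_path from the method above:

import bz2
import shutil

with bz2.open(filename=download_path, mode='rb') as compressed_file:
    with open(file=self.data_path, mode='wb') as decompressed_file:
        shutil.copyfileobj(compressed_file, decompressed_file)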
Example No. 10
    def _find_in_attributes_(
        self,
        include_keywords: Collection[str],
        exclude_keywords: Collection[str] = (),
        use_keyword_priority: bool = False
    ) -> Tuple[Optional[str], Optional[str]]:
        # find eligible attribute keys
        eligible_keys_1 = []
        for word in include_keywords:
            for k in self.attributes.keys():
                if word in k:
                    eligible_keys_1.append(k)

        eligible_keys_2 = []
        # ignore excluded attribute keywords
        for k in eligible_keys_1:
            excluded = False
            for word in exclude_keywords:
                if word in k:
                    excluded = True
                    break
            if not excluded:
                eligible_keys_2.append(k)

        if len(eligible_keys_2) == 1 or (len(eligible_keys_2) > 1
                                         and use_keyword_priority):
            # eligible_keys are already ordered in the same order as include_keywords
            value = self.attributes[eligible_keys_2[0]].lower()
            return eligible_keys_2[0], value
        elif len(eligible_keys_2) > 1:
            remove_file(self.file_path)
            raise AssertionError(
                f'NCBI XML host {self.acc_id} contains more than one attribute matching {eligible_keys_2}. XML host '
                f'file removed.')
        else:
            return None, None
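
For illustration, the same matching rules applied to a plain dict (a standalone replica for demonstration only; unlike the method above, it does not remove the XML file before raising):

from typing import Collection, Optional, Tuple

def find_in_attributes(attributes: dict,
                       include_keywords: Collection[str],
                       exclude_keywords: Collection[str] = (),
                       use_keyword_priority: bool = False) -> Tuple[Optional[str], Optional[str]]:
    # keys matching any include keyword, in include_keywords order
    eligible = [k for w in include_keywords for k in attributes if w in k]
    # drop keys that match any exclude keyword
    eligible = [k for k in eligible if not any(w in k for w in exclude_keywords)]
    if len(eligible) == 1 or (len(eligible) > 1 and use_keyword_priority):
        return eligible[0], attributes[eligible[0]].lower()
    if len(eligible) > 1:
        raise AssertionError(f'more than one attribute matches: {eligible}')
    return None, None

attrs = {'collection date': '2020-03-01', 'receipt date': '2020-03-05'}
print(find_in_attributes(attrs, ['date'], exclude_keywords=['receipt']))
# ('collection date', '2020-03-01')
print(find_in_attributes(attrs, ['receipt date', 'date'], use_keyword_priority=True))
# ('receipt date', '2020-03-05')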
Example No. 11
 def download_epitopes_data():
     # make sure the output path does not exist already, or wget assigns a trailing number to it
     remove_file(download_bcell_local_file_path)
     remove_file(download_tcell_local_file_path)
     remove_file(download_mhc_ligand_local_file_path)
     logger.info(f'downloading tcell_full from {tcell_url} ...')
     wget.download(tcell_url, download_tcell_local_file_path)
     logger.info(f'downloading bcell_full from {bcell_url} ...')
     wget.download(bcell_url, download_bcell_local_file_path)
     logger.info(f'downloading mhc_ligand_full from {mhc_ligand_url} ...')
     wget.download(mhc_ligand_url, download_mhc_ligand_local_file_path)
     logger.info('\n')
Example No. 12
def run(from_sample: Optional[int] = None, to_sample: Optional[int] = None):
    global virus, virus_id, import_method
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])
    virus = COGUKSarsCov2()
    # IMPORT VIRUS TAXON DATA
    virus_id = database.try_py_function(vcm.create_or_get_virus, virus)

    # update last import date
    database.try_py_function(vcm.update_db_metadata, virus_id, 'COG-UK')

    # find outdated and new samples from source (some sequences can be updated, so the sets are not necessarily disjoint)
    logger.warning(
        "Current implementation of deltas for COG-UK uses more than 10 GB of RAM to cache query results and save time.\n"
        "IF YOUR SYSTEM CAN'T PROVIDE MORE THAN 10 GB OF RAM, STOP THE PROCESS NOW.\n"
        "The program will resume in 15 seconds")
    try:
        sleep(15)
    except KeyboardInterrupt:
        return
    id_outdated_sequences, id_new_sequences = virus.deltas()
    logger.warning('Checking deltas... The program will resume in 30 seconds.')
    try:
        sleep(30)
    except KeyboardInterrupt:
        return

    # select range
    if from_sample is not None and to_sample is not None:
        id_new_sequences = {
            id_new_sequences.pop()
            for i in range(to_sample - from_sample)
        }
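        # note: set.pop() removes an arbitrary element, so this keeps an arbitrary
        # subset of (to_sample - from_sample) ids rather than a positional slice.
        # A deterministic alternative (a sketch; assumes the ids are sortable):
        #   from itertools import islice
        #   id_new_sequences = set(islice(sorted(id_new_sequences), from_sample, to_sample))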

    # create pipeline_event (will be inserted later)
    pipeline_event = database.PipelineEvent(
        event_date=datetime.now().strftime("%Y-%m-%d"),
        event_name='COGUK sars_cov_2 sequences update',
        removed_items=len(id_outdated_sequences),
        changed_items=0,
        added_items=len(id_new_sequences),  # may change later if some sequences fail to import
    )

    # initialize statistics module
    stats_module.schedule_samples(
        stats_module.StatsBasedOnIds(id_new_sequences, True, virus_id,
                                     ['COG-UK']))

    # remove outdated sequences
    logger.info('removing outdated sequences')
    database.try_py_function(
        vcm.remove_sequence_and_meta_list,
        primary_sequence_accession_id=id_outdated_sequences)
    stats_module.removed_samples(id_outdated_sequences)
    for _id in id_outdated_sequences:
        file_path = get_local_folder_for(
            virus.name, FileType.Annotations) + str(_id).replace(
                '/', '-') + ".pickle"
        remove_file(file_path)

    # prepare multiprocessing
    logger.info('importing virus sequences and related tables')
    import_method = Parallel()

    vcm.DBCache.commit_changes()
    for s in virus.get_sequences_of_updated_source(
            filter_accession_ids=id_new_sequences):
        if not s.nucleotide_sequence():
            logger.info(
                f'sample {s.primary_accession_number()} skipped because nucleotide sequence is empty or null'
            )
            continue
        try:
            database.try_py_function(import_method.import_virus_sample, s)
            vcm.DBCache.commit_changes()
        except Exception:
            logger.exception(
                f'exception occurred while working on virus sample {s.primary_accession_number()}'
            )
            vcm.DBCache.rollback_changes()

    logger.info('main process completed')
    import_method.tear_down()

    # remove leftovers of failed samples
    try:
        metadata_samples_to_remove: set = stats_module.get_scheduled_not_completed()
        if len(metadata_samples_to_remove) > 1100:
            send_message(
                f"COGUK importer can have a bug. {len(metadata_samples_to_remove)} out of "
                f"{len(id_new_sequences)} failed.")
        pipeline_event.added_items -= len(metadata_samples_to_remove)
        if len(metadata_samples_to_remove) > 0:
            logger.info(
                f"Removing metadata leftovers of imports that failed during variant/annotation calling or metadata"
                f" ({len(metadata_samples_to_remove)} samples)")

            metadata_samples_to_remove_as_string: list = [
                str(x) for x in metadata_samples_to_remove
            ]
            logger.trace("Accession id of failed imports:\n"
                         f"{metadata_samples_to_remove_as_string}")
            logger.info("Deleting leftovers in database")
            database.try_py_function(
                vcm.remove_sequence_and_meta_list,
                primary_sequence_accession_id=metadata_samples_to_remove_as_string)
    except Exception:
        logger.exception(
            "Removal of metadata leftovers in the DB of the samples that failed was not successful."
        )

    database.try_py_function(vcm.insert_data_update_pipeline_event,
                             pipeline_event)