Code example #1
 def __init__(self):
     logger.info(
         f'importing virus {self.name} using NCBI SC2 for taxonomy data')
     # fetch taxonomy data from NCBI
     taxonomy_file_path = download_ncbi_taxonomy_as_xml(
         get_local_folder_for(source_name=self.name,
                              _type=FileType.TaxonomyData), self.taxon_id())
     try:
         self.tax_tree = etree.parse(
             taxonomy_file_path,
             parser=etree.XMLParser(remove_blank_text=True))
     except etree.XMLSyntaxError as e:  # happens on AWS if for some reason the downloaded file is corrupted
         remove_file(taxonomy_file_path)
         ncbi_sc2_taxonomy_dir = get_local_folder_for(
             source_name=ncbi_known_settings["sars_cov_2"]
             ["generated_dir_name"],
             _type=FileType.TaxonomyData)
         alternative_taxonomy_path = ncbi_sc2_taxonomy_dir + f"{ncbi_known_settings['sars_cov_2']['virus_taxon_id']}.xml"
         if os.path.exists(alternative_taxonomy_path):
             shutil.copyfile(alternative_taxonomy_path, taxonomy_file_path)
             self.tax_tree = etree.parse(
                 taxonomy_file_path,
                 parser=etree.XMLParser(remove_blank_text=True))
         else:
             logger.error(
                 f"Taxonomy file of SARS-CoV-2 was corrupted. Attempt to use the one from {ncbi_sc2_taxonomy_dir} "
                 f"failed because the file doesn't exist. Can't proceed.")
             raise e
     # fetch latest source data
     download_dir = get_local_folder_for(
         source_name=self.name, _type=FileType.SequenceOrSampleData)
     self.sequence_file_path, self.metadata_file_path = download_or_get_sample_data(
         download_dir)
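The recovery path is the notable piece here: a corrupted download surfaces as etree.XMLSyntaxError and is replaced with a previously generated SARS-CoV-2 taxonomy file before retrying. A minimal sketch of that pattern in isolation, assuming only lxml; parse_xml_with_fallback and its path arguments are hypothetical names, not part of this codebase:

import os
import shutil

from lxml import etree


def parse_xml_with_fallback(primary_path, backup_path):
    """Parse primary_path; if it is corrupted, restore it from backup_path and retry."""
    parser = etree.XMLParser(remove_blank_text=True)
    try:
        return etree.parse(primary_path, parser=parser)
    except etree.XMLSyntaxError:
        if not os.path.exists(backup_path):
            raise  # no known-good copy available: give up, as the original code does
        os.remove(primary_path)  # discard the corrupted download
        shutil.copyfile(backup_path, primary_path)
        return etree.parse(primary_path, parser=parser)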
Code example #2
def main_pipeline_part_3(session: database.Session, sample, db_sequence_id):
    file_path = get_local_folder_for(virus.name, FileType.Annotations) + str(
        sample.internal_id()) + ".pickle"
    try:
        if not os.path.exists(file_path):
            annotations_and_nuc_variants = sequence_aligner(
                sample.internal_id(), reference_sequence,
                sample.nucleotide_sequence(), sc2_chromosome,
                sc2_annotations_file_path, sc2_snpeff_db_name)
            with open(file_path, mode='wb') as cache_file:
                pickle.dump(annotations_and_nuc_variants,
                            cache_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
        else:
            with open(file_path, mode='rb') as cache_file:
                annotations_and_nuc_variants = pickle.load(cache_file)
        annotations, nuc_variants = annotations_and_nuc_variants
        for ann in annotations:
            vcm.create_annotation_and_amino_acid_variants(
                session, db_sequence_id, *ann)
        for nuc in nuc_variants:
            vcm.create_nuc_variants_and_impacts(session, db_sequence_id, nuc)
        stats_module.completed_sample(sample.primary_accession_number())
    except Exception as e:
        if str(e).endswith("sequence contains letters not in the alphabet"):
            logger.warning(
                f"sample {sample.primary_accession_number()} skipped because its sequence contains letters not in "
                f"the alphabet")
        else:
            logger.exception(
                f'exception occurred during pipeline_part_3 of sample {sample.primary_accession_number()}. '
                f'Doing rollback of insertion of variants and annotations + deletion of cache'
            )
        remove_file(file_path)
        raise e
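Beyond the error handling, the function is a compute-or-load cache: the alignment runs only when no pickle exists for the sample, and on failure both the DB insertions and the cache file are discarded so a retry starts clean. The caching core, reduced to a reusable helper (cached_call is an illustrative name, not part of the codebase):

import os
import pickle


def cached_call(cache_path, compute):
    """Return the pickled result at cache_path, computing and storing it on a cache miss."""
    if os.path.exists(cache_path):
        with open(cache_path, mode='rb') as f:
            return pickle.load(f)
    result = compute()
    with open(cache_path, mode='wb') as f:
        pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)
    return result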
Code example #3
def generate_fasta(virus_taxon_id: int,
                   virus_folder_name: str,
                   generated_file_name: str,
                   only_null_lineages: bool = False) -> str:
    """
    Generates a multi fasta file containing all the sequences of the given virus.

    :return: the absolute path to the generated fasta file.
    """
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])

    virus_db_id = virus_database_id(virus_taxon_id)

    if virus_db_id is None:
        raise Exception('Before running this algorithm, create the '
                        f'virus associated with taxon {virus_taxon_id}')

    def get_acc_ids_and_sequences_from_db(
            session: database.Session) -> Generator[Tuple, None, None]:
        query = session.query(database.Sequence.accession_id,
                              database.NucleotideSequence.nucleotide_sequence)\
            .filter(database.Sequence.virus_id == virus_db_id,
                    database.Sequence.sequence_id == database.NucleotideSequence.sequence_id)
        if only_null_lineages:
            query = query.filter(database.Sequence.lineage.is_(None))
        for pair in query.all():
            yield pair[0], pair[1]

    def get_total_acc_ids_from_db(session: database.Session) -> int:
        query = session.query(func.count(database.Sequence.accession_id)) \
            .filter(database.Sequence.virus_id == virus_db_id)
        if only_null_lineages:
            query = query.filter(database.Sequence.lineage.is_(None))
        return query.first()[0]

    target_file_path = get_local_folder_for(
        virus_folder_name, FileType.Fasta) + generated_file_name
    logger.info(f"Generating fasta...")
    with open(file=target_file_path, mode='w') as file:
        total_count = database.try_py_function(get_total_acc_ids_from_db)
        print(total_count)
        data = database.try_py_function(get_acc_ids_and_sequences_from_db)
        progress = tqdm(total=total_count)
        if total_count > 0:
            first = next(data)
            file.write(f'>{first[0]}\n')
            file.write(first[1])
            progress.update()
        for acc_id_and_sequences in data:
            file.write(f'\n>{acc_id_and_sequences[0]}\n')
            file.write(acc_id_and_sequences[1])
            progress.update()
        progress.close()
    target_file_path = abspath(target_file_path)
    logger.info(f"Fasta file generated at {target_file_path}")
    return target_file_path
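Stripped of the database plumbing, the writing loop treats the first record specially so the file ends without a trailing newline. The same loop over any iterable of (accession_id, sequence) pairs, as a self-contained sketch (write_multifasta is a hypothetical name):

from typing import Iterable, Tuple


def write_multifasta(records: Iterable[Tuple[str, str]], path: str) -> None:
    """Write (accession_id, sequence) pairs in FASTA format, without a trailing newline."""
    with open(path, mode='w') as fasta:
        for i, (acc_id, seq) in enumerate(records):
            prefix = '\n' if i > 0 else ''  # newline before every record except the first
            fasta.write(f'{prefix}>{acc_id}\n{seq}')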
Code example #4
def download_refseq_of_viruses():
    for virus_key_name, import_parameter in ncbi_known_settings.items():
        virus_dir_name = import_parameter["generated_dir_name"]

        virus_dir_path = get_local_folder_for(virus_dir_name,
                                              FileType.SequenceOrSampleData)
        refseq_query = import_parameter["reference_sample_query"]
        refseq_sample_acc_id = get_samples_accession_ids(refseq_query)
        assert len(
            refseq_sample_acc_id
        ) == 1, f'Invalid reference sequence for virus {import_parameter["log_with_name"]}'
        path_of_refseq = download_or_get_ncbi_sample_as_xml(
            virus_dir_path, refseq_sample_acc_id[0])
        yield path_of_refseq
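Note that the function is a generator: nothing is downloaded until the caller iterates it, and each iteration handles one virus. A hypothetical usage:

for refseq_path in download_refseq_of_viruses():
    print(f'reference sample stored at {refseq_path}')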
Code example #5
 def reference_sequence(self) -> str:
     if not hasattr(self, 'reference_sample'):
         # import a reference sequence from a different dataset that we'll use to call nucleotide variants
         # get reference sample accession id
         ncbi_reference_sample_query = ncbi_known_settings["sars_cov_2"][
             "reference_sample_query"]
         reference_accession_id = get_samples_accession_ids(
             ncbi_reference_sample_query)
         assert len(reference_accession_id) == 1, \
             "no reference sample found or multiple RefSeqs. Please correct the query used on NCBI nuccore"
         # download file as XML
         reference_seq_file_path = download_or_get_ncbi_sample_as_xml(
             get_local_folder_for(source_name=self.name,
                                  _type=FileType.SequenceOrSampleData),
             reference_accession_id[0])
         # parse file and cache the object
         self.reference_sample = AnyNCBIVNucSample(
             reference_seq_file_path, reference_accession_id[0])
     return self.reference_sample.nucleotide_sequence()
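The hasattr check is a one-shot per-instance cache. Since Python 3.8 the same idea can be written declaratively with functools.cached_property (turning the method call into an attribute access); a self-contained sketch in which the body merely stands in for the NCBI download:

from functools import cached_property


class ReferenceHolder:
    @cached_property
    def reference_sequence(self) -> str:
        print('downloading reference once...')  # runs only on first access
        return 'ACGT' * 10  # stand-in for the downloaded sequence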
Code example #6
class GISAIDSarsCov2(VirusSource):

    name = 'gisaid_sars_cov_2'
    data_path = get_local_folder_for(name, FileType.SequenceOrSampleData) + 'export.json'
    credentials_path = f'.{sep}data_sources{sep}gisaid_sars_cov_2{sep}credentials_gisaid.csv'

    def __init__(self):
        super().__init__()
        logger.info(f'importing virus {GISAIDSarsCov2.name}')
        self.update_source_data()

    def taxon_id(self):
        return 2697049

    def taxon_name(self):
        return 'Severe acute respiratory syndrome coronavirus 2'

    def family(self):
        return 'Coronaviridae'

    def sub_family(self):
        return 'Orthocoronavirinae'

    def genus(self):
        return 'Betacoronavirus'

    def species(self):
        return 'Severe acute respiratory syndrome-related coronavirus'

    def equivalent_names(self):
        return 'SARS-CoV-2, 2019-nCoV, COVID-19, COVID-19 virus, COVID19, HCoV-19, Human coronavirus 2019, SARS-2, SARS-CoV2, SARS2, Wuhan coronavirus, Wuhan seafood market pneumonia virus'

    def molecule_type(self):
        return 'RNA'

    def is_single_stranded(self):
        return True

    def is_positive_stranded(self):
        return True

    def count_sequences_in_file(self):
        with open(self.data_path, mode='r') as input_file:
            num_lines = sum(1 for line in input_file)
        return num_lines

    def update_source_data(self):
        logger.info("Downloading updates from source...")
        # read user credentials
        if not os.path.exists(self.credentials_path):
            with open(self.credentials_path, mode='w') as credentials_file:
                credentials_file.write("# Lines starting with # are comments.\n"
                                       "# Write in the following line <username>,<password> to use for downloading "
                                       "updated sequence data from GISAID.")
            raise AssertionError(f"No GISAID credentials provided. Please update the file at path {self.credentials_path}")
        username, psw = None, None
        with open(self.credentials_path, mode='r') as credentials_file:
            for line in credentials_file:
                if line.startswith("#"):
                    continue
                try:
                    username, psw = line.split(",")
                    username = username.strip()
                    psw = psw.strip()
                except Exception as e:
                    logger.error(f"Error encountered while parsing GISAID credentials file at path {self.credentials_path}")
                    raise e
        # guard against a credentials file that contains only comments
        if not username or not psw:
            raise AssertionError(f"No GISAID credentials provided. Please update the file at path {self.credentials_path}")
        # download updated data from source
        download_path = get_local_folder_for(self.name, FileType.SequenceOrSampleData)
        download_path += "export_" + date.today().strftime("%Y-%b-%d") + ".json.bz2"
        remove_file(download_path)
        remove_file(self.data_path)
        os.system(f"wget --user {username} --password {psw} -O {download_path} https://www.epicov.org/epi3/3p/virusurf/export/export.json.bz2")
        if not exists(download_path):
            raise ValueError("download of https://www.epicov.org/epi3/3p/virusurf/export/export.json.bz2 with username "
                             f"'{username}' and password '{psw}' failed.")
        # extract archive to self.data_path
        with bz2.open(filename=download_path, mode='rt') as compressed_file:
            with open(file=self.data_path, mode="w") as decompressed_file:
                for line in compressed_file:
                    decompressed_file.write(line)

    def get_sequences_in_current_data(self) -> dict:
        def do(session):
            all_sequences = dict()
            for db_items in session.query(Sequence, HostSample, SequencingProject) \
                                      .filter(Sequence.strain_name.isnot(None),
                                              Sequence.host_sample_id == HostSample.host_sample_id,
                                              Sequence.sequencing_project_id == SequencingProject.sequencing_project_id) \
                                      .yield_per(100):
                source_seq, source_host, source_prj = db_items
                all_sequences[source_seq.accession_id] = (source_seq.strain_name, int(source_seq.length),
                                                          source_seq.gc_percentage, source_seq.n_percentage,
                                                          source_host.collection_date, source_host.originating_lab,
                                                          str(source_prj.submission_date) if source_prj.submission_date is not None else None,
                                                          source_prj.sequencing_lab,
                                                          source_host.country, source_host.region,
                                                          source_host.isolation_source)
            return all_sequences

        return try_py_function(do)

    def get_sequences_of_updated_source(self) -> Generator[GISAIDSarsCov2Sample, None, None]:
        with open(self.data_path, mode='r') as input_file:
            for line in input_file:
                try:
                    yield GISAIDSarsCov2Sample(json.loads(line))
                except JSONDecodeError:
                    pass

    # ### COMPARE ANNOTATIONS AND AMINO ACID VARIANTS ### #
    # It's far more efficient to fetch the annotations of a group of sequences in a single query
    # than to query them for each sequence one at a time
    def collect_aa_variants_from_db(self, sample_acc_ids: list):
        all_db_annotations = dict()     # acc_id -> [(aa_var1), (aa_var_2), ...]

        def do(session):
            nonlocal all_db_annotations
            all_aa_variants = session \
                .query(Sequence.accession_id, Annotation.start, Annotation.stop, Annotation.gene_name,
                       Annotation.feature_type, Annotation.product,
                       AminoAcidVariant.start_aa_original, AminoAcidVariant.sequence_aa_original,
                       AminoAcidVariant.sequence_aa_alternative) \
                .select_from(Annotation, AminoAcidVariant, Sequence) \
                .filter(Annotation.annotation_id == AminoAcidVariant.annotation_id,
                        Annotation.sequence_id == Sequence.sequence_id,
                        Sequence.accession_id.in_(sample_acc_ids)) \
                .order_by(Sequence.accession_id, Annotation.start, AminoAcidVariant.start_aa_original) \
                .all()
            for acc_id, ann_start, ann_stop, gene_name, feature_type, product, \
                start_aa_original, sequence_aa_original, sequence_aa_alternative in all_aa_variants:
                # group aa_variants by accession id into a map
                try:
                    seq_annotations = all_db_annotations[acc_id]
                except KeyError:
                    seq_annotations = list()
                    all_db_annotations[acc_id] = seq_annotations
                content = (ann_start, ann_stop, gene_name, feature_type, product,
                           start_aa_original, sequence_aa_original, sequence_aa_alternative)
                seq_annotations.append(content)

        try_py_function(do)
        return all_db_annotations

    def annotations_changed(self, local_annotations_list: List[Tuple], sample: GISAIDSarsCov2Sample):
        local_ann = Counter(local_annotations_list)
        updated_ann = Counter(self.collect_aa_variants_from_file(sample))
        annotations_in_db_not_in_file = local_ann - updated_ann
        annotations_in_file_not_in_db = updated_ann - local_ann

        if sum(annotations_in_db_not_in_file.values()) > 0 or sum(annotations_in_file_not_in_db.values()) > 0:
            return True
        else:
            return False

    def collect_aa_variants_from_file(self, sample: GISAIDSarsCov2Sample):
        formatted_annotations = []
        for start, stop, feature_type, gene_name, product, db_xref_merged, amino_acid_sequence, _mutations in sample.annotations_and_amino_acid_variants():
            for mut in _mutations:
                original_aa, alternative_aa, start_pos, length, _type = mut
                content = (start, stop, gene_name, feature_type, product, start_pos, original_aa, alternative_aa)
                formatted_annotations.append(content)
        return formatted_annotations

    def deltas(self):
        """
        Returns the set of Sequence.accession_id to remove from the current database, and the set of
        Sequence.accession_id that should be imported from the source. Note that because sequences in the
        current data may have been updated, the two sets are not necessarily disjoint; first remove the
        sequences marked for removal, and only then insert the new ones.
        """
        acc_id_remote = set([x.primary_accession_number() for x in self.get_sequences_of_updated_source()])  # all the sequences from remote
        logger.info('Collecting current metadata...')
        current_data = self.get_sequences_in_current_data()
        acc_id_current = set(current_data.keys())

        acc_id_missing_in_remote = acc_id_current - acc_id_remote
        acc_id_missing_in_current = acc_id_remote - acc_id_current

        acc_id_present_in_current_and_remote = acc_id_current & acc_id_remote
        acc_id_changed_updatable = dict()
        acc_id_changed_not_updatable = []

        changes_distribution = {
            "sequence": 0,
            "host_sample": 0,
            "sequencing_project": 0,
            "annotations": 0
        }

        # collect annotations from DB only for sequences that can have changes
        logger.info('Collecting current annotations...')
        aa_variants_local = self.collect_aa_variants_from_db(list(acc_id_present_in_current_and_remote))
        # to compare sequences present in both, we have to scan the file of remote sequences
        logger.info('Comparing current data with updated source data')
        for new_sequence in tqdm(self.get_sequences_of_updated_source(), total=self.count_sequences_in_file()):
            acc_id = new_sequence.primary_accession_number()
            try:
                current_sequence_data = current_data[acc_id]
                # if the gisaid id is the same, compare metadata to detect if the sample changed over time

                # if strain or length changes, overlaps of this sequence become invalid; otherwise they can be kept
                if current_sequence_data[0] != new_sequence.strain() \
                        or current_sequence_data[1] != new_sequence.length():
                    acc_id_changed_not_updatable.append(acc_id)
                else:
                    # detect which tables have changes
                    changes = {
                        "sequence": False,
                        "host_sample": False,
                        "sequencing_project": False,
                        "annotations": False
                    }
                    # changes in sequence table (implies a likely change in the sequence and variants too)
                    if current_sequence_data[2] != new_sequence.gc_percent() \
                            or current_sequence_data[3] != new_sequence.n_percent():
                        changes["sequence"] = True
                        changes_distribution["sequence"] = changes_distribution["sequence"] + 1
                    if self.annotations_changed(aa_variants_local[acc_id], new_sequence):
                        changes["annotations"] = True
                        changes_distribution["annotations"] = changes_distribution["annotations"] + 1
                    # changes in host sample table
                    if current_sequence_data[4] != new_sequence.collection_date() \
                            or current_sequence_data[5] != new_sequence.originating_lab() \
                            or (current_sequence_data[8], current_sequence_data[9]) != \
                                new_sequence.country__region__geo_group()[:2] \
                            or current_sequence_data[10] != new_sequence.isolation_source():
                        changes["host_sample"] = True
                        changes_distribution["host_sample"] = changes_distribution["host_sample"] + 1
                    # changes in sequencing project table
                    if current_sequence_data[6] != new_sequence.submission_date() \
                            or current_sequence_data[7] != new_sequence.sequencing_lab():
                        changes["sequencing_project"] = True
                        changes_distribution["sequencing_project"] = changes_distribution["sequencing_project"] + 1

                    if any(changes.values()):
                        acc_id_changed_updatable[acc_id] = changes
            except KeyError:
                pass  # the accession id is not present in current data. it's a new sequence

        # compute additional sets
        acc_id_changed_not_updatable = set(acc_id_changed_not_updatable)
        acc_id_unchanged = acc_id_present_in_current_and_remote - acc_id_changed_updatable.keys() - acc_id_changed_not_updatable

        acc_id_to_remove = acc_id_missing_in_remote | acc_id_changed_not_updatable
        acc_id_to_import = acc_id_missing_in_current | acc_id_changed_not_updatable

        logger.info(f'\n'
                    f'# Sequences from remote source: {len(acc_id_remote)}. Of which\n'
                    f'# {len(acc_id_unchanged)} are also present locally and unchanged\n'
                    f'# {len(acc_id_missing_in_current)} never seen before.\n'
                    f'# {len(acc_id_changed_not_updatable)} have changes in the remote source that impact overlaps (strain/length)\n'  # TODO temporary info
                    f"# {len(acc_id_changed_updatable)} have changes that don't affect overlaps\n"
                    f'# Sequences from local source: {len(acc_id_current)}. Of which\n'
                    f'# {len(acc_id_missing_in_remote)} are missing from remote and must be removed from local.\n'
                    f'# In conclusion: {len(acc_id_to_remove)} sequences will be removed because missing or changed strain/length in remote\n'
                    f'# {len(acc_id_to_import)} sequences will be imported because novel or changed strain/length in remote\n'
                    f'# {len(acc_id_changed_updatable)} sequences will be updated with changes from remote.')

        logger.info(f"distribution of updates: {changes_distribution}")

        return acc_id_to_remove, acc_id_to_import, acc_id_changed_updatable
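A detail worth isolating from annotations_changed(): annotation lists are compared as multisets. Counter subtraction keeps only positive counts, so both directions must be checked; a non-empty difference either way means an annotation was added, removed, or repeated a different number of times. In isolation (multisets_differ is an illustrative name):

from collections import Counter


def multisets_differ(a, b):
    ca, cb = Counter(a), Counter(b)
    # Counter subtraction drops non-positive counts, hence the two-sided check
    return bool(ca - cb) or bool(cb - ca)

For lists of hashable tuples, as used here, this is equivalent to Counter(a) != Counter(b).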
Code example #7
def import_samples_into_vcm():
    global fasta_list, refseq_sc1, refseq_sc2, refseq_sc1_len, refseq_sc2_len, cached_taxonomy, fasta_folder, \
        taxonomy_folder, imported_viruses
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])
    fasta_folder = get_local_folder_for('NMDC', FileType.SequenceOrSampleData)
    taxonomy_folder = get_local_folder_for('NMDC', FileType.TaxonomyData)
    fasta_list = get_fasta_list()
    stats_module.schedule_samples(set_info=stats_module.StatsBasedOnIds(
        # str.rstrip('.fasta') would strip characters, not the suffix, so cut the extension explicitly
        [x[:-len('.fasta')] if x.endswith('.fasta') else x for x in fasta_list], True))
    logger.warning(
        f'{len(fasta_list)} files found at {base_url}. Some of them may be skipped because they have no metadata'
        f' or because they are not related to SARS or SARS-CoV-2.')
    logger.info('downloading fasta files')
    download_fastas()

    # REFERENCE SEQUENCE
    refseq_sc2 = reference_sequence(
        "(txid2697049[Organism]) AND srcdb_refseq[Properties]")
    refseq_sc2_len = len(refseq_sc2)
    refseq_sc1 = reference_sequence(
        "txid694009[Organism:exp] NOT txid2697049[Organism] AND srcdb_refseq[Properties]"
    )
    refseq_sc1_len = len(refseq_sc1)

    def virus_taxonomy_pipeline(session: database.Session, taxon: AnyNCBITaxon):
        return vcm.create_or_get_virus(session, taxon)

    # noinspection PyTypeChecker
    def metadata_pipeline(session: database.Session,
                          a_sample: NMDCVirusSample):
        try:
            experiment_id = vcm.create_or_get_experiment(session, a_sample)
            host_specie_id = vcm.create_or_get_host_specie(session, a_sample)
            host_sample_id = vcm.create_or_get_host_sample(
                session, a_sample, host_specie_id)
            sequencing_project_id = vcm.create_or_get_sequencing_project(
                session, a_sample)
            sequence, nucleotide_seq = vcm.create_and_get_sequence(
                session, a_sample, virus_id, experiment_id, host_sample_id,
                sequencing_project_id)
            vcm.DBCache.commit_changes()
            return sequence.sequence_id
        except Exception as e:
            if str(e).startswith(
                    'duplicate key value violates unique constraint "sequence_accession_id_key"'
            ):
                logger.error(
                    f'exception occurred while working on virus sample {a_sample}: {str(e)}'
                )
            else:
                logger.exception(
                    f'exception occurred while working on virus sample {a_sample}'
                )
            vcm.DBCache.rollback_changes()
            raise database.Rollback()

    def nucleotide__annotations__pipeline(session: database.Session,
                                          a_sample: NMDCVirusSample,
                                          db_sequence_id):
        if a_sample.taxon_name(
        ) == 'Severe acute respiratory syndrome coronavirus 2':
            refseq = refseq_sc2
        elif a_sample.taxon_name() == 'Bat SARS-related coronavirus':
            refseq = refseq_sc1
        else:
            raise Exception(f'unknown taxon organism {a_sample.taxon_name()}')
        try:
            file_path = get_local_folder_for(
                'NMDC', FileType.Annotations) + str(
                    a_sample.primary_accession_number()) + ".pickle"
            if not os.path.exists(file_path):
                annotations_and_nuc_variants = sequence_aligner(
                    db_sequence_id, refseq, a_sample.nucleotide_sequence(),
                    sc2_chr_name, sc2_ann_file_path, sc2_snpeff_db_name)
                with open(file_path, mode='wb') as cache_file:
                    pickle.dump(annotations_and_nuc_variants,
                                cache_file,
                                protocol=pickle.HIGHEST_PROTOCOL)
            else:
                with open(file_path, mode='rb') as cache_file:
                    annotations_and_nuc_variants = pickle.load(cache_file)
            annotations, nuc_variants = annotations_and_nuc_variants
            for ann in annotations:
                vcm.create_annotation_and_amino_acid_variants(
                    session, db_sequence_id, *ann)
            for nuc in nuc_variants:
                vcm.create_nuc_variants_and_impacts(session, db_sequence_id,
                                                    nuc)
            stats_module.completed_sample(a_sample.primary_accession_number())
        except Exception:
            logger.exception(
                f'exception occurred while working on annotations and nuc_variants of virus sample '
                f'{a_sample.primary_accession_number()}. Rollback transaction.'
            )
            raise database.Rollback()

    # create pipeline_event (will be inserted later)
    pipeline_event = database.PipelineEvent(
        event_date=datetime.now().strftime("%Y-%m-%d"),
        event_name=f'NMDC sars_cov_2 sequences update',
        removed_items=0,
        changed_items=0)

    logger.info('begin import of selected records')
    vcm.DBCache.commit_changes()
    total_sequences_imported = 0
    total_sequences_skipped = 0
    log_of_gisaid_id_path = f"{get_local_folder_for('NMDC', FileType.Logs)}{os.path.sep}gisa_ids.txt"
    with open(log_of_gisaid_id_path, mode='w') as log_of_gisaid_id:
        for file in tqdm(fasta_list):
            try:
                sample = NMDCVirusSample(file)

                # filter samples by organism
                organism_name = sample.taxon_name()
                if organism_name != 'Severe acute respiratory syndrome coronavirus 2':
                    logger.info(
                        f'Sample {file} skipped because related to organism {organism_name}'
                    )
                    total_sequences_skipped += 1
                    continue
                # download taxonomy for new organisms
                organism = cached_taxonomy.get(organism_name)
                if not organism:
                    organism_file = download_ncbi_taxonomy_as_xml_from_name(
                        taxonomy_folder, organism_name)
                    organism = AnyNCBITaxon(organism_file)
                    cached_taxonomy[organism_name] = organism
            except FileNotFoundError:
                logger.error(f'Sample {file} skipped')
                total_sequences_skipped += 1
                continue
            except AssertionError:
                logger.exception(f'Sample {file} skipped')
                total_sequences_skipped += 1
                continue

            # virus id associated to this sample
            virus_id = database.try_py_function(virus_taxonomy_pipeline,
                                                organism)
            if virus_id not in imported_viruses:
                imported_viruses.add(virus_id)
                database.try_py_function(vcm.update_db_metadata, virus_id,
                                         'NMDC')
                vcm.DBCache.commit_changes()
            if virus_id:
                gisa_id = sample.gisa_id()
                if gisa_id:
                    log_of_gisaid_id.write(gisa_id + '\n')

                # import sample
                sequence_id = database.try_py_function(metadata_pipeline,
                                                       sample)
                vcm.DBCache.commit_changes()
                if sequence_id:
                    database.try_py_function(nucleotide__annotations__pipeline,
                                             sample, sequence_id)
                total_sequences_imported += 1

        logger.info(f'{total_sequences_imported} sequences imported.')
        logger.info(f'{total_sequences_skipped} sequences skipped.')
        if total_sequences_skipped > 100:
            send_message(
                f"NMDC importer may have a bug. {total_sequences_skipped} out of "
                f"{total_sequences_skipped + total_sequences_imported} were not imported."
            )

        logger.info(f'list of sequences with GISAID references at path: {log_of_gisaid_id_path}')

    pipeline_event.added_items = total_sequences_imported
    database.try_py_function(vcm.insert_data_update_pipeline_event,
                             pipeline_event)
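The taxonomy handling in the loop above is a cache-on-miss dictionary: download the XML once per organism name, then reuse the parsed object. The same pattern as a reusable helper (get_or_load and load are illustrative names):

def get_or_load(cache: dict, key, load):
    """Return cache[key], computing it with load(key) on the first request."""
    value = cache.get(key)
    if value is None:
        value = load(key)
        cache[key] = value
    return value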
Code example #8
def run(from_sample: Optional[int] = None, to_sample: Optional[int] = None):
    global virus, virus_id, import_method
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])
    virus = COGUKSarsCov2()
    # IMPORT VIRUS TAXON DATA
    virus_id = database.try_py_function(vcm.create_or_get_virus, virus)

    # update last import date
    database.try_py_function(vcm.update_db_metadata, virus_id, 'COG-UK')

    # find outdated and new samples from source (some sequences can be updated, so the sets are not necessarily disjoint)
    logger.warning(
        "Current implementation of deltas for COG-UK uses more than 10 GB of RAM to cache query results and save time.\n"
        "IF YOUR SYSTEM CAN'T PROVIDE MORE THAN 10 GB OF RAM, STOP THE PROCESS NOW.\n"
        "The program will resume in 15 seconds")
    try:
        sleep(15)
    except KeyboardInterrupt:
        return
    id_outdated_sequences, id_new_sequences = virus.deltas()
    logger.warning('Check the deltas. The program will resume in 30 seconds.')
    try:
        sleep(30)
    except KeyboardInterrupt:
        return

    # select a subset of the requested size (set.pop() removes arbitrary elements)
    if from_sample is not None and to_sample is not None:
        id_new_sequences = {
            id_new_sequences.pop()
            for _ in range(to_sample - from_sample)
        }

    # create pipeline_event (will be inserted later)
    pipeline_event = database.PipelineEvent(
        event_date=datetime.now().strftime("%Y-%m-%d"),
        event_name=f'COGUK sars_cov_2 sequences update',
        removed_items=len(id_outdated_sequences),
        changed_items=0,
        added_items=len(
            id_new_sequences
        ),  # may eventually change if some sequence are not imported
    )

    # initialize statistics module
    stats_module.schedule_samples(
        stats_module.StatsBasedOnIds(id_new_sequences, True, virus_id,
                                     ['COG-UK']))

    # remove outdated sequences
    logger.info(f'removing outdated sequences')
    database.try_py_function(
        vcm.remove_sequence_and_meta_list,
        primary_sequence_accession_id=id_outdated_sequences)
    stats_module.removed_samples(id_outdated_sequences)
    for _id in id_outdated_sequences:
        file_path = get_local_folder_for(
            virus.name, FileType.Annotations) + str(_id).replace(
                '/', '-') + ".pickle"
        remove_file(file_path)

    # prepare multiprocessing
    logger.info(f'importing virus sequences and related tables')
    import_method = Parallel()

    vcm.DBCache.commit_changes()
    for s in virus.get_sequences_of_updated_source(
            filter_accession_ids=id_new_sequences):
        if not s.nucleotide_sequence():
            logger.info(
                f'sample {s.primary_accession_number()} skipped because nucleotide sequence is empty or null'
            )
            continue
        try:
            database.try_py_function(import_method.import_virus_sample, s)
            vcm.DBCache.commit_changes()
        except Exception:
            logger.exception(
                f'exception occurred while working on virus sample {s.primary_accession_number()}'
            )
            vcm.DBCache.rollback_changes()

    logger.info('main process completed')
    import_method.tear_down()

    # remove leftovers of failed samples
    try:
        metadata_samples_to_remove: set = stats_module.get_scheduled_not_completed(
        )
        if len(metadata_samples_to_remove) > 1100:
            send_message(
                f"COGUK importer may have a bug. {len(metadata_samples_to_remove)} out of "
                f"{len(id_new_sequences)} failed.")
        pipeline_event.added_items = pipeline_event.added_items - len(
            metadata_samples_to_remove)
        if len(metadata_samples_to_remove) > 0:
            logger.info(
                f"Removing metadata leftovers of imports that failed during variant/annotation calling or metadata"
                f" ({len(metadata_samples_to_remove)} samples)")

            metadata_samples_to_remove_as_string: list = [
                str(x) for x in metadata_samples_to_remove
            ]
            logger.trace("Accession id of failed imports:\n"
                         f"{metadata_samples_to_remove_as_string}")
            logger.info("Deleting leftovers in database")
            database.try_py_function(vcm.remove_sequence_and_meta_list,
                                     primary_sequence_accession_id=
                                     metadata_samples_to_remove_as_string)
    except Exception:
        logger.exception(
            "Removal of metadata leftovers in the DB of the samples that failed was not successful."
        )

    database.try_py_function(vcm.insert_data_update_pipeline_event,
                             pipeline_event)
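One caveat in run(): set.pop() removes an arbitrary element, so the from_sample/to_sample window selects an unpredictable subset of id_new_sequences and the two arguments only determine its size. If a reproducible slice is preferable, sorting first makes the selection deterministic; a sketch with hypothetical names:

def select_range(ids, start, stop):
    """Deterministic alternative to repeated set.pop(): slice the sorted ids."""
    return set(sorted(ids)[start:stop])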