Example no. 1
0
def addLociDataFromFiles(msi_spl, in_loci_list, method_name, keys, res_cls):
    """
    Get selected data for loci of a sample and add them as data in LocusRes.

    :param msi_spl: The sample where the results are added.
    :type msi_spl: MSISample
    :param in_loci_list: The path to the file containing the list of metrics files by locus (format: TSV). The header must be: #Locus_position<tab>Locus_name<tab>Filepath. Each file referenced in "Filepath" must be in JSON format and must contain a dictionary of metrics for one locus of the sample.
    :type in_loci_list: str
    :param method_name: The name of the method storing locus results in LocusRes.
    :type method_name: str
    :param keys: The keys extracted from the locus data file and stored in LocusRes.
    :type keys: dict (keys are names in the locus data file and values are names in LocusRes.data)
    :param res_cls: The class used to store LocusRes in msi_locus.
    :type res_cls: LocusRes or one of its subclasses
    """
    with HashedSVIO(in_loci_list) as FH_loci_list:
        for record in FH_loci_list:  # One file per locus
            # Add locus
            if record["Locus_position"] not in msi_spl.loci:
                msi_spl.addLocus(
                    MSILocus(record["Locus_position"], record["Locus_name"]))
            msi_locus = msi_spl.loci[record["Locus_position"]]
            # Add result and data
            addLociResult(msi_locus, record["Filepath"], method_name, keys,
                          res_cls)
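A minimal usage sketch for the function above; the paths, method name and key mapping are hypothetical, and MSISample, LocusRes and addLociDataFromFiles are assumed to be in scope:
# Hypothetical loci list (TSV), header as described in the docstring:
# #Locus_position<tab>Locus_name<tab>Filepath
# 4:54732007-54732089<tab>BAT-25<tab>/path/to/splA_BAT-25.json
spl = MSISample("splA")  # hypothetical sample
addLociDataFromFiles(
    spl,
    "/path/to/loci_list.tsv",            # hypothetical loci list
    "model",                             # hypothetical method name
    {"nb_by_length": "nb_by_length"},    # hypothetical mapping: key in the JSON -> key in LocusRes.data
    LocusRes
)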
Example no. 2
0
def getNbOccur(in_profile, nb_distinct_reads):
    """
    Return the number of future occurrences for each distinct read.

    :param in_profile: Path to the file containing the percentage of distinct sequences by number of duplications (format: TSV). Header line must start with "#" and must contain "duplication_level" and "%_distinct".
    :type in_profile: str
    :param nb_distinct_reads: The duplication will be applied to this number of distinct reads.
    :type nb_distinct_reads: int
    :return: The number of future occurrences for each distinct read.
    :rtype: list
    """
    # Get profile
    profile = None
    with HashedSVIO(in_profile, title_starter="#") as FH_in:
        profile = FH_in.read()
    # Get nb_occurences
    nb_occurences = []
    for category in profile:
        nb_reads_at_dup_lvl = int(
            round(float(category["%_distinct"]) * nb_distinct_reads / 100, 0))
        for idx in range(nb_reads_at_dup_lvl):
            nb_occurences.append(int(category["duplication_level"]))
    nb_missing = nb_distinct_reads - len(nb_occurences)  # Pad with 1-occurrence reads when rounding produced fewer reads than requested
    for idx in range(nb_missing):
        nb_occurences.append(1)
    # Shuffle nb_occurences
    random.shuffle(nb_occurences)
    return nb_occurences
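A hedged usage sketch; the profile path and values are hypothetical and the header follows the docstring above:
# Hypothetical duplication profile (TSV):
# #duplication_level<tab>%_distinct
# 1<tab>90.0
# 5<tab>10.0
occurrences = getNbOccur("/path/to/duplication_profile.tsv", 1000)  # hypothetical path
# len(occurrences) == 1000; 900 reads kept once and 100 reads duplicated 5 times, in shuffled order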
Example no. 3
0
def getVariantsProfile(profile_path, min_allele_freq=None):
    """
    Return variants profiles (type, occurrence, allele frequency range and length range) from a TSV file.

    :param profile_path: Path to the profiles file (format: TSV). The header must start with "#" and contain: Type, Occurence, Freq_min, Freq_max, Lg_min and Lg_max.
    :type profile_path: str
    :param min_allele_freq: If provided, each profile's minimum allele frequency is checked against this lower bound and against its precision.
    :type min_allele_freq: float
    :return: The variants profiles.
    :rtype: list
    """
    profiles = list()
    with HashedSVIO(profile_path, title_starter="#") as FH_profile:
        for record in FH_profile:  # Type   Occurence Freq_min    Freq_max    Lg_min  Lg_max
            curr_profile = {
                "type": record["Type"],
                "occurence": float(record["Occurence"]),
                "AF": {
                    "min": float(record["Freq_min"]),
                    "max": float(record["Freq_max"])
                },
                "length": {
                    "min": int(record["Lg_min"]),
                    "max": int(record["Lg_max"])
                }
            }
            profiles.append(curr_profile)
            if min_allele_freq is not None:
                if min_allele_freq > curr_profile["AF"]["min"]:
                    log.error(
                        "The minimum allele frequency in {} must be >= {}.".
                        format(profile_path, min_allele_freq))
                if curr_profile["AF"]["min"] != round(
                        curr_profile["AF"]["min"], int(1 / min_allele_freq)):
                    log.error(
                        "The allele frequency precision must be >= {}.".format(
                            min_allele_freq))
                if curr_profile["AF"]["max"] != round(
                        curr_profile["AF"]["max"], int(1 / min_allele_freq)):
                    log.error(
                        "The allele frequency precision must be >= {}.".format(
                            min_allele_freq))
    return profiles
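A short usage sketch; the path and the profile values are hypothetical and follow the column names read above:
# Hypothetical variants profile (TSV):
# #Type<tab>Occurence<tab>Freq_min<tab>Freq_max<tab>Lg_min<tab>Lg_max
# substitution<tab>0.9<tab>0.01<tab>0.5<tab>1<tab>1
# deletion<tab>0.1<tab>0.05<tab>0.5<tab>1<tab>10
profiles = getVariantsProfile("/path/to/variants_profile.tsv", min_allele_freq=0.01)  # hypothetical path
# profiles[0]["AF"] -> {"min": 0.01, "max": 0.5}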
Example no. 4
0
def getStatus(in_annotations, samples):
    """
    Return status by locus by sample.

    :param in_annotations: Path to the file containing status by locus by sample (format: TSV).
    :type in_annotations: str
    :param samples: List of sample names.
    :type samples: list
    :return: Status by locus by sample.
    :rtype: dict
    """
    status_by_spl = {}
    samples = set(samples)
    with HashedSVIO(in_annotations, title_starter="") as FH:
        for record in FH:
            spl_name = getSplFromLibName(record["sample"])
            if spl_name in samples:
                status_by_spl[spl_name] = {
                    key: value
                    for key, value in record.items()
                    if key not in ["sample", "sample_status"]
                }
                status_by_spl[spl_name]["sample"] = record["sample_status"]
    for spl in samples:
        if spl not in status_by_spl:
            raise Exception("Sample {} has no expected data.".format(spl))
    return status_by_spl
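A usage sketch with a hypothetical annotations file; it assumes getSplFromLibName() maps the library name "splA_lib1" to the sample name "splA":
# Hypothetical annotations (TSV), header without "#":
# sample<tab>sample_status<tab>4:54732007-54732089
# splA_lib1<tab>MSI<tab>MSS
status_by_spl = getStatus("/path/to/annotations.tsv", ["splA"])  # hypothetical path and sample
# status_by_spl["splA"] -> {"4:54732007-54732089": "MSS", "sample": "MSI"}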
Example no. 5
0
def loadMitelman(db_path, db_version, fusions_by_partners, aliases_by_symbol, annotation_symbols):
    """
    Set fusions partners data from Mitelman database: MBCA.TXT.DATA,REF.TXT.DATA.

    :param db_path: Path to the Mitelman MBCA.TXT.DATA file, optionally followed by "," and the path to REF.TXT.DATA (format: TSV).
    :type db_path: str
    :param db_version: Database version to traceback sources in fusions_by_partners.
    :type db_version: str
    :param fusions_by_partners: For each partners pair (upGene_@_downGene), the IDs of fusions by source (chimerakb, cosmic, mitelman, pubmed, ...).
    :type fusions_by_partners: dict
    :param aliases_by_symbol: Gene name aliases by symbol.
    :type aliases_by_symbol: dict
    :param annotation_symbols: Gene names known in the gene annotations file.
    :type annotation_symbols: set
    """
    mbca_path = db_path
    pubmed_by_fusion = {}
    if "," in db_path:
        mbca_path, ref_path = db_path.split(",")
        pubmed_by_fusion = pubmedByFusion(ref_path)
    # MolClin    RefNo    InvNo    Morph    Topo    Immunology    GeneLength    GeneShort    GeneLong    KaryLength    KaryShort    KaryLong
    with HashedSVIO(mbca_path) as reader:
        for record in reader:
            if record["GeneShort"] != "":
                for fusion in record["GeneShort"].split(","):
                    if "/" in fusion:
                        genes = fusion.replace("+", "").split("/")  # PDRG1/ARF3/RUNX1 => fusion between 3 genes
                        for up_gene, down_gene in zip(genes, genes[1:]):  # For each breakpoint
                            found = False
                            try:
                                up_gene = selectAnnotSymbol(up_gene, annotation_symbols, aliases_by_symbol)
                                down_gene = selectAnnotSymbol(down_gene, annotation_symbols, aliases_by_symbol)
                                found = True
                            except Exception:
                                log.warning(
                                    "Error parsing gene names [{}, {}] from Mitelman (RefNo: {}).".format(
                                        up_gene, down_gene, record["RefNo"]
                                    )
                                )
                            if found:
                                fusion_partners = "{}_@_{}".format(up_gene, down_gene)
                                source = "mitelman_{}".format(db_version)
                                if fusion_partners not in fusions_by_partners:
                                    fusions_by_partners[fusion_partners] = {source: set()}
                                if source not in fusions_by_partners[fusion_partners]:
                                    fusions_by_partners[fusion_partners][source] = set()
                                fusions_by_partners[fusion_partners][source].add(int(record["RefNo"]))
                                if "PMID" not in fusions_by_partners[fusion_partners]:
                                    fusions_by_partners[fusion_partners]["PMID"] = set()
                                    for pmid in pubmed_by_fusion[record["RefNo"]]:
                                        fusions_by_partners[fusion_partners]["PMID"].add(int(pmid))
Example no. 6
0
def process(args, log):
    """
    Convert an MSI status file (splA<tab>status_locus_1<tab>status_locus_2) into an MSI annotation file.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    :param log: The logger of the script.
    :type log: logging.Logger
    """
    # Get targeted loci IDs and names
    loci_in_bed = []
    id_by_name = {}
    with BEDIO(args.input_targets) as FH_in:
        for record in FH_in:
            id = "{}:{}-{}".format(record.chrom, record.start - 1, record.end)
            id_by_name[record.name] = id
            loci_in_bed.append(id)
    if not args.loci_by_id:
        loci_in_bed = sorted(id_by_name.keys())
    # Write annotation file
    with HashedSVIO(args.input_status, title_starter="") as FH_in:
        loci_in_status = set([elt for elt in FH_in.titles if elt != "sample"])
        if len(set(loci_in_bed) - loci_in_status) > 0:
            msg = "The following loci are defined in targets but are missing from status file: {}".format(
                set(loci_in_status) - loci_in_status)
            log.error(msg)
            raise Exception(msg)
        with MSIAnnot(args.output_annotations, "w") as FH_out:
            for record in FH_in:
                for locus in loci_in_bed:
                    if record[locus] not in Status.authorizedValues():
                        msg = 'The status "{}" of the locus {} in sample {} is invalid. It must be: {}'.format(
                            record[locus], locus, record["sample"],
                            Status.authorizedValues())
                        log.error(msg)
                        raise Exception(msg)
                    FH_out.write({
                        "sample": record["sample"],
                        "locus_position": locus if args.loci_by_id else id_by_name[locus],
                        "method_id": "model",
                        "key": "status",
                        "value": record[locus],
                        "type": "str"
                    })
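A sketch of how this entry point might be driven; the Namespace fields mirror the attributes read above and every value is hypothetical:
from argparse import Namespace
import logging

args = Namespace(
    input_targets="/path/to/loci.bed",         # hypothetical BED of targeted loci
    input_status="/path/to/status.tsv",        # hypothetical status by locus by sample
    output_annotations="/path/to/annot.tsv",   # hypothetical output in MSIAnnot format
    loci_by_id=False                           # status columns contain locus names, not positions
)
process(args, logging.getLogger("statusToAnnot"))  # hypothetical logger name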
Example no. 7
0
def sourcesBySymbols(in_known):
    """
    Return sources descriptions by fusion ID from the database.

    :param in_known: Path to the file containing known fusions (format: TSV). This file must contain 3 columns: 5prim_gene, 3prim_gene and sources. 5prim_gene and 3prim_gene are symbols matching the master names used in the GTF for the annotation of breakends. sources is a string containing db1name:entryId,entryId|db2name:entryId (example: cosmic_91:1743,1745|chimerdb_pub-V4:3427,3428).
    :type in_known: str
    :return: sources descriptions (db1name:entryId,entryId|db2name:entryId) by fusion ID (5primSymbol_@_3primSymbol).
    :rtype: dict
    """
    sources_by_symbols = {}
    with HashedSVIO(in_known) as reader:
        for record in reader:
            fusion_id = "{}_@_{}".format(record["5prim_gene"],
                                         record["3prim_gene"])
            sources_by_symbols[fusion_id] = record["sources"]
    return sources_by_symbols
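A small usage sketch; the known-fusions path and row are hypothetical and follow the 3-column layout described in the docstring:
# Hypothetical known fusions (TSV) with columns 5prim_gene, 3prim_gene and sources:
# BCR<tab>ABL1<tab>cosmic_91:1743,1745|chimerdb_pub-V4:3427
sources_by_symbols = sourcesBySymbols("/path/to/known_fusions.tsv")  # hypothetical path
# sources_by_symbols["BCR_@_ABL1"] -> "cosmic_91:1743,1745|chimerdb_pub-V4:3427"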
Example no. 8
0
def aliasesBySymbols(in_aliases):
    """
    Return all name aliases for each gene symbol.

    :param in_aliases: Path to the gene synonyms file (format: TSV).
    :type in_aliases: str
    :return: Name aliases by gene symbol.
    :rtype: dict
    """
    is_ncbi = True
    with HashedSVIO(in_aliases) as reader:
        if "Gene name" in reader.titles and "Gene Synonym" in reader.titles:
            is_ncbi = False
    if is_ncbi:
        return aliasesBySymbolsFromNCBI(in_aliases)
    else:
        return aliasesBySymbolsFromEnsembl(in_aliases)
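A sketch of the dispatch on column titles; both paths are hypothetical:
# NCBI gene_info export (columns include "Symbol" and "Synonyms") -> aliasesBySymbolsFromNCBI
aliases = aliasesBySymbols("/path/to/Homo_sapiens.gene_info.tsv")  # hypothetical path
# Ensembl BioMart export (columns "Gene name" and "Gene Synonym") -> aliasesBySymbolsFromEnsembl
aliases = aliasesBySymbols("/path/to/ensembl_synonyms.tsv")  # hypothetical path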
Example no. 9
0
def process(args):
    """
    Tag stability for loci and sample from length distribution on loci.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    spl_name = args.sample_name
    if args.sample_name is None:
        spl_name = os.path.basename(args.output_report).split(".")[0]
        if spl_name.endswith("_report"):
            spl_name = spl_name[:-7]
    msi_spl = MSISample(spl_name)
    # Parse lengths metrics by loci
    with HashedSVIO(args.input_combined_list) as FH_loci_list:
        for record in FH_loci_list:
            with open(record["Filepath"]) as FH_locus:
                locus_metrics = json.load(FH_locus)
            msi_locus = MSILocus.fromDict({
                "name": record["Locus_name"],
                "position": record["Locus_position"],
                "results": {
                    "PairsCombi": {
                        "_class": "LocusResPairsCombi",
                        "status": Status.none,
                        "data": {
                            "nb_by_length": locus_metrics["nb_by_length"],
                            "nb_pairs_aligned": locus_metrics["nb_uncombined_pairs"] + locus_metrics["nb_combined_pairs"]
                        }
                    }
                }
            })
            msi_spl.addLocus(msi_locus)
    # Process status
    msi_models = MSIReport.parse(args.input_models)
    for locus_id in msi_spl.loci:
        processor = PairsCombiProcessor(locus_id, msi_models, [msi_spl],
                                        args.min_support)
        processor.setLocusStatus()
    msi_spl.setStatus("PairsCombi")
    # Write report
    MSIReport.write([msi_spl], args.output_report)
Example no. 10
0
def pubmedByFusion(in_ref):
    """
    Return PubMed IDs by reference number (RefNo) from REF.TXT.DATA of the Mitelman database.

    :param in_ref: Path to the REF.TXT.DATA from the Mitelman database (format: TSV).
    :type in_ref: str
    :return: PubMed IDs by reference number (RefNo).
    :rtype: dict
    """
    # RefNo    TitleLength    TitleShort    TitleLong    Volume    Year    Journal    Text    Abbreviation    AuthorsLength    AuthorsShort    AuthorsLong    Flag    Pubmed
    pubmed_by_fusion = {}
    with HashedSVIO(in_ref) as reader:
        for record in reader:
            if record["RefNo"] not in pubmed_by_fusion:
                pubmed_by_fusion[record["RefNo"]] = set()
            if record["Pubmed"] != "":
                pubmed_by_fusion[record["RefNo"]].add(record["Pubmed"])
    return pubmed_by_fusion
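A brief sketch; the path is hypothetical and each RefNo maps to a set of PubMed IDs (possibly empty):
pubmed_by_ref = pubmedByFusion("/path/to/REF.TXT.DATA")  # hypothetical path
# pubmed_by_ref["1234"] -> {"567890"}, or set() when the Pubmed field is empty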
Example no. 11
0
def writePartnersDb(db_path, fusions_by_partners):
    """
    Write known fusions partners database.

    :param db_path: Path to the fusions partners database (format: TSV).
    :type db_path: str
    :param fusions_by_partners: For each partners pair (upGene_@_downGene), the IDs of fusions by source (chimerakb, cosmic, mitelman, pubmed, ...).
    :type fusions_by_partners: dict
    """
    with HashedSVIO(db_path, "w") as writer:
        writer.titles = ["5prim_gene", "3prim_gene", "sources"]
        for partners, entries_by_src in fusions_by_partners.items():
            up_gene, down_gene = partners.split("_@_")
            sources = [src + ":" + ",".join([str(elt) for elt in sorted(ids)]) for src, ids in entries_by_src.items()]
            writer.write({
                "5prim_gene": up_gene,
                "3prim_gene": down_gene,
                "sources": "|".join(sources)
            })
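A round-trip sketch using the dict structure produced by the loaders above; the output path and entries are hypothetical:
fusions_by_partners = {
    "BCR_@_ABL1": {"cosmic_91": {1743, 1745}, "PMID": {567890}}  # hypothetical content
}
writePartnersDb("/path/to/partners_db.tsv", fusions_by_partners)  # hypothetical path
# Written data line: BCR<tab>ABL1<tab>cosmic_91:1743,1745|PMID:567890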
Example no. 12
0
def loadGeneric(db_path, db_name, db_version, fusions_by_partners, aliases_by_symbol, annotation_symbols, up_title="up_gene", down_title="down_gene"):
    """
    Set fusions partners data from single source database.

    :param db_path: Path to fusions database (format: TSV).
    :type db_path: str
    :param db_name: Database name.
    :type db_name: str
    :param db_version: Database version to traceback sources in fusions_by_partners.
    :type db_version: str
    :param fusions_by_partners: For each partners pair (upGene_@_downGene), the IDs of fusions by source (Babiceanu, BodyMap, ...).
    :type fusions_by_partners: dict
    :param aliases_by_symbol: Gene name aliases by symbol.
    :type aliases_by_symbol: dict
    :param annotation_symbols: Gene names known in the gene annotations file.
    :type annotation_symbols: set
    :param up_title: Title of column containing gene name of first partner.
    :type up_title: str
    :param down_title: Title of column containing gene name of second partner.
    :type down_title: str
    """
    source = "{}_{}".format(db_name, db_version)
    with HashedSVIO(db_path) as reader:
        for record in reader:
            up_gene = None
            down_gene = None
            try:
                up_gene = selectAnnotSymbol(record[up_title], annotation_symbols, aliases_by_symbol)
                down_gene = selectAnnotSymbol(record[down_title], annotation_symbols, aliases_by_symbol)
            except Exception:
                log.warning(
                    "Error parsing gene names [{}, {}] from {}.".format(
                        record[up_title], record[down_title], db_name
                    )
                )
            if up_gene and down_gene:
                fusion_partners = "{}_@_{}".format(up_gene, down_gene)
                if fusion_partners not in fusions_by_partners:
                    fusions_by_partners[fusion_partners] = {source: set()}
                if source not in fusions_by_partners[fusion_partners]:
                    fusions_by_partners[fusion_partners][source] = set()
                fusions_by_partners[fusion_partners][source].add(fusion_partners)
Example no. 13
0
def loadCosmic(db_path, db_version, fusions_by_partners, aliases_by_symbol, annotation_symbols):
    """
    Set fusions partners data from cosmic database.

    :param db_path: Path to the cosmic database (format: TSV).
    :type db_path: str
    :param db_version: Database version to traceback sources in fusions_by_partners.
    :type db_version: str
    :param fusions_by_partners: For each partners pair (upGene_@_downGene), the IDs of fusions by source (chimerakb, cosmic, mitelman, pubmed, ...).
    :type fusions_by_partners: dict
    :param aliases_by_symbol: Gene name aliases by symbol.
    :type aliases_by_symbol: dict
    :param annotation_symbols: Gene names known in the gene annotations file.
    :type annotation_symbols: set
    """
    # Sample ID    Sample name    Primary site    Site subtype 1    Site subtype 2    Site subtype 3    Primary histology    Histology subtype 1    Histology subtype 2    Histology subtype 3    Fusion ID    Translocation Name    5'_CHROMOSOME    5'_GENOME_START_FROM    5'_GENOME_START_TO    5'_GENOME_STOP_FROM    5'_GENOME_STOP_TO    5'_STRAND    3'_CHROMOSOME    3'_GENOME_START_FROM    3'_GENOME_START_TO    3'_GENOME_STOP_FROM    3'_GENOME_STOP_TO    3'_STRAND    Fusion type    Pubmed_PMID
    with HashedSVIO(db_path) as reader:
        for record in reader:
            if record["Translocation Name"] != "":
                matches = re.fullmatch(r"ENS.+\((.+)\):.+_ENS.+\((.+)\):.+", record["Translocation Name"])  # ENST00000324093.4(PLXND1):r.1_2864_ENST00000393238.3(TMCC1):r.918_5992
                if matches is None:
                    log.warning(
                        "Error parsing gene names from {} in cosmic (PMID: {}).".format(
                            record["Translocation Name"],
                            record["Pubmed_PMID"]
                        )
                    )
                else:
                    up_gene, down_gene = matches.groups()
                    up_gene = selectAnnotSymbol(up_gene, annotation_symbols, aliases_by_symbol)
                    down_gene = selectAnnotSymbol(down_gene, annotation_symbols, aliases_by_symbol)
                    fusion_partners = "{}_@_{}".format(up_gene, down_gene)
                    source = "cosmic_{}".format(db_version)
                    if fusion_partners not in fusions_by_partners:
                        fusions_by_partners[fusion_partners] = {source: set()}
                    if source not in fusions_by_partners[fusion_partners]:
                        fusions_by_partners[fusion_partners][source] = set()
                    fusions_by_partners[fusion_partners][source].add(int(record["Fusion ID"]))
                    if record["Pubmed_PMID"] != "":
                        if "PMID" not in fusions_by_partners[fusion_partners]:
                            fusions_by_partners[fusion_partners]["PMID"] = set()
                        fusions_by_partners[fusion_partners]["PMID"].add(int(record["Pubmed_PMID"]))
Example no. 14
0
def loadChimerdb(db_path, db_version, fusions_by_partners, aliases_by_symbol, annotation_symbols):
    """
    Set fusions partners data from chimerdb database.

    :param db_path: Path to the chimerdb database (format: TSV).
    :type db_path: str
    :param db_version: Database version to traceback sources in fusions_by_partners.
    :type db_version: str
    :param fusions_by_partners: For each partners pair (upGene_@_downGene), the IDs of fusions by source (chimerakb, cosmic, mitelman, pubmed, ...).
    :type fusions_by_partners: dict
    :param aliases_by_symbol: Gene name aliases by symbol.
    :type aliases_by_symbol: dict
    :param annotation_symbols: Gene names known in the gene annotations file.
    :type annotation_symbols: set
    """
    # id    Source    webSource    Fusion_pair    H_gene    H_chr    H_position    H_strand    T_gene    T_chr    T_position    T_strand    Breakpoint_Type    Genome_Build_Version    PMID    Disease    Validation    Kinase    Oncogene    Tumor_suppressor    Receptor    Transcription_Factor    ChimerPub    ChimerSeq
    with HashedSVIO(db_path) as reader:
        for record in reader:
            up_gene = None
            down_gene = None
            try:
                up_gene = selectAnnotSymbol(record["H_gene"], annotation_symbols, aliases_by_symbol)
                down_gene = selectAnnotSymbol(record["T_gene"], annotation_symbols, aliases_by_symbol)
            except Exception:
                log.warning(
                    "Error parsing gene names [{}, {}] from chimerDB (PMID: {}).".format(
                        record["H_gene"], record["T_gene"], record["PMID"]
                    )
                )
            if up_gene and down_gene:
                fusion_partners = "{}_@_{}".format(up_gene, down_gene)
                source = "chimerdb_{}".format(db_version)
                if fusion_partners not in fusions_by_partners:
                    fusions_by_partners[fusion_partners] = {source: set()}
                if source not in fusions_by_partners[fusion_partners]:
                    fusions_by_partners[fusion_partners][source] = set()
                fusions_by_partners[fusion_partners][source].add(int(record["id"]))
                if record["PMID"] != "":
                    if "PMID" not in fusions_by_partners[fusion_partners]:
                        fusions_by_partners[fusion_partners]["PMID"] = set()
                    pubmed_ids = set(map(int, record["PMID"].split(",")))
                    fusions_by_partners[fusion_partners]["PMID"] = fusions_by_partners[fusion_partners]["PMID"] | pubmed_ids
Example no. 15
0
def getGroupsData(groups_path,
                  samples,
                  sample_tag="Sample",
                  group_tag="Group",
                  separator="\t"):
    """
    @summary: Return group name by sample, samples by group and samples without group from a separated-value file.
    @param groups_path: [str] Path to the separated-value file describing links between samples and groups.
    @param samples: [list] The list of all samples.
    @param sample_tag: [str] The title of the column used to store the sample names.
    @param group_tag: [str] The title of the column used to store the group names.
    @param separator: [str] The separator used between fields in the input file.
    @return: [tuple] The first element is a dictionary giving the group name by sample. The second element is a dictionary giving the list of samples by group name. The last element is a dictionary whose keys are the samples without group.
    """
    group_by_spl = {}
    spl_by_group = {}
    without_group = {}
    processed_by_spl = {spl: False for spl in samples}
    # Parse groups information
    with HashedSVIO(groups_path, separator=separator,
                    title_starter="#") as FH_gp:
        for record in FH_gp:
            sample = record[sample_tag]
            group = record[group_tag]
            if sample not in processed_by_spl:
                raise Exception(
                    'The sample "{}" found in {} does not exist in expected samples.'
                    .format(sample, groups_path))
            processed_by_spl[sample] = True
            group_by_spl[sample] = group
            if group in spl_by_group:
                spl_by_group[group].append(sample)
            else:
                spl_by_group[group] = [sample]
    # Store samples without group
    for spl, is_in_gp in processed_by_spl.items():
        if not is_in_gp:
            without_group[spl] = True
    # Return
    return group_by_spl, spl_by_group, without_group
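A usage sketch; the groups file and sample names are hypothetical and use the default column titles and separator:
# Hypothetical groups file (TSV):
# #Sample<tab>Group
# splA<tab>case
# splB<tab>control
group_by_spl, spl_by_group, without_group = getGroupsData(
    "/path/to/groups.tsv", ["splA", "splB", "splC"]  # hypothetical path and samples
)
# group_by_spl -> {"splA": "case", "splB": "control"}; without_group -> {"splC": True}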
Example no. 16
0
def getNoise(input_noise):
    """
    Return by variant id ("chrom:pos=ref/alt") the noise rate.

    :param input_noise: The path to the file containing artifactual variants with their maximum frequency (format: TSV). The header line of the file must be "#Chromosome<tab>Position<tab>Reference_allele<tab>Alternative_allele<tab>Noise_rate".
    :type input_noise: str
    :return: By variant id ("chrom:pos=ref/alt") the noise rate.
    :rtype: dict
    """
    expected_titles = ["Chromosome", "Position", "Reference_allele", "Alternative_allele", "Noise_rate"]
    noise_by_var = dict()
    with HashedSVIO(input_noise, title_starter="#") as FH_noise:
        if FH_noise.titles != expected_titles:
            raise Exception(
                'The header line in "{}" does not correspond to "#{}".'.format(
                    input_noise, "\t".join(expected_titles)
                )
            )
        for record in FH_noise:
            variant_id = "{}:{}={}/{}".format(record["Chromosome"], record["Position"], record["Reference_allele"], record["Alternative_allele"])
            noise_by_var[variant_id] = float(record["Noise_rate"])
    return noise_by_var
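A small usage sketch; the path and values are hypothetical and the file uses the exact header checked above:
# Hypothetical noise file (TSV):
# #Chromosome<tab>Position<tab>Reference_allele<tab>Alternative_allele<tab>Noise_rate
# chr17<tab>7674220<tab>C<tab>T<tab>0.002
noise_by_var = getNoise("/path/to/noise.tsv")  # hypothetical path
# noise_by_var["chr17:7674220=C/T"] -> 0.002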
Example no. 17
0
def aliasesBySymbolsFromEnsembl(in_aliases):
    """
    Return all name aliases for each gene symbol from an Ensembl BioMart export.

    :param in_aliases: Path to the gene synonyms file from Ensembl (format: TSV).
    :type in_aliases: str
    :return: Name aliases by gene symbol.
    :rtype: dict
    """
    aliases_by_symbol = {}
    with HashedSVIO(in_aliases) as reader:
        for record in reader:
            name = record["Gene name"]
            alias = record["Gene Synonym"]
            if name not in aliases_by_symbol:
                aliases_by_symbol[name] = [name, alias]
            else:
                aliases_by_symbol[name].append(alias)
            if alias not in aliases_by_symbol:
                aliases_by_symbol[alias] = [alias, name]
            else:
                aliases_by_symbol[alias].append(name)
    return aliases_by_symbol
Example no. 18
0
def aliasesBySymbolsFromNCBI(in_aliases):
    """
    Return all name aliases for each gene symbol from the NCBI RefSeq gene_info file.

    :param in_aliases: Path to the gene synonyms file from gene_info (format: TSV).
    :type in_aliases: str
    :return: Name aliases by gene symbol.
    :rtype: dict
    """
    aliases_by_symbol = {}
    with HashedSVIO(in_aliases) as reader:
        for record in reader:
            name = record["Symbol"]
            aliases = record["Synonyms"].split("|")
            if name not in aliases_by_symbol:
                aliases_by_symbol[name] = [name] + aliases
            else:
                aliases_by_symbol[name] += aliases
            for alias in aliases:
                if alias not in aliases_by_symbol:
                    aliases_by_symbol[alias] = [name] + aliases
                else:
                    aliases_by_symbol[alias] += [name]
    return aliases_by_symbol
Example no. 19
0
        help='Path to the merged variants file (format: VCF).')
    args = parser.parse_args()

    # Logger
    logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s')
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Get accession by chromosome ID
    acc_by_chrom = {}
    if args.input_assembly_accessions:
        with HashedSVIO(args.input_assembly_accessions, title_starter=None) as FH:
            for record in FH:
                acc_by_chrom[record["sequence_id"]] = record["RefSeq_accession"]

    # Connect to HGVS mapper
    if args.input_sequence_repository is not None:
        os.environ["HGVS_SEQREPO_DIR"] = args.input_sequence_repository
    hgvs_mapper = getAssemblyMapper(args.assembly_version,
                                    args.input_UTA_config)

    # Write
    nb_records = {
        "analysed": 0,
        "fixed_HGVSg": 0,
        "fixed_HGVSc": 0,
    args = parser.parse_args()

    # Logger
    logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s')
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Load annotations
    log.info("Load model from {}.".format(args.input_annotations))
    tr_by_id = {tr.annot["id"]: tr for tr in loadModel(args.input_annotations, "transcripts")}

    # Parse and convert domains data
    log.info("Parse and convert domains data from {}.".format(args.input_domains))
    domains_by_tr_id = dict()
    with HashedSVIO(args.input_domains) as reader:
        for record in reader:
            if record['Interpro ID'] != "":
                record['Interpro start'] = int(record['Interpro start'])
                record['Interpro end'] = int(record['Interpro end'])
                tr_id = record['Transcript stable ID version'].split(".", 1)[0]
                if tr_id not in tr_by_id:
                    log.warning("The transcript {} is missing in {}.".format(tr_id, args.input_annotations))
                else:
                    domain_id = record['Interpro ID']
                    # Get genomic coordinates
                    transcript = tr_by_id[tr_id]
                    protein = transcript.proteins[0]
                    if len(transcript.proteins) > 1:
                        msg = "The transcript {} is linked with several proteins {}.".format(tr_id, [prot.annot["id"] for prot in transcript.proteins])
                        log.error(msg)
Example no. 21
0
        default=".",
        help='Path to the output folder. [Default: %(default)s]')
    args = parser.parse_args()

    # Logger
    logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] %(message)s')
    log = logging.getLogger()
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Get status by locus
    status_by_spl = {}
    with HashedSVIO(args.input_status, title_starter="") as FH_in:
        for record in FH_in:
            status_by_spl[record["sample"]] = {
                locus: status
                for locus, status in record.items()
                if locus not in ["sample", "sample_status"]
            }

    # Get min and max amplicon size by locus
    range_by_locus = {}
    for filename in os.listdir(args.input_data):
        filepath = os.path.join(args.input_data, filename)
        report = MSIReport.parse(filepath)
        for spl in report:
            for locus_id, locus in spl.loci.items():
                if locus_id not in range_by_locus: