    def delete(self, request, key):
        db = database.get_or_create(self.media)
        if key not in db:
            return Response(status.HTTP_404_NOT_FOUND)
        else:
            del db[key]
            return True
    def post(self, request):
        db = database.get_or_create(self.media)

        for m in self.CONTENT:
            ID = hashlib.sha256(json.dumps(m).encode()).hexdigest()
            if ID not in db:
                db[ID] = m

        return Response(status.HTTP_201_CREATED)
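
The POST handler above keys each record by hashing its JSON serialization. Since json.dumps preserves dict insertion order, logically identical payloads can hash differently; a minimal sketch of a more stable content hash (the sort_keys=True flag and the helper name are assumptions, not part of the original) could look like this:

import hashlib
import json

def content_id(record):
    # serialize with sorted keys so logically equal dicts
    # always produce the same digest (an added assumption)
    payload = json.dumps(record, sort_keys=True).encode()
    return hashlib.sha256(payload).hexdigest()
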
Example #3
    def set(self, name):
        value = request.form['value']
        goto = request.form.get(
            'goto',
            url_for('ContentManagementSystem:settings')
        )
        setting, is_created = get_or_create(Setting, name=name)
        if is_created:
            db.session.add(setting)
        setting.value = value

        db.session.commit()
        return redirect(goto)
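
Unlike the dict-like database.get_or_create(self.media) used in the first two snippets, most of the examples below call a two-value get_or_create(Model, **kwargs) helper that returns the matching instance together with a flag saying whether it had to be created; its definition is not shown on this page. A minimal sketch consistent with that usage, assuming a Flask-SQLAlchemy style model query, might be:

def get_or_create(model, **kwargs):
    # try to find an existing row matching the given filters
    instance = model.query.filter_by(**kwargs).first()
    if instance:
        return instance, False
    # not found: build a new, not-yet-persisted instance; the caller
    # decides when to db.session.add() and commit it, as in the examples
    instance = model(**kwargs)
    return instance, True

This matches the pattern above, where the caller adds the instance to the session only when the created flag is set (Example #18 uses a variant that additionally takes the session as its first argument).
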
Example #4
    def save_settings(self):
        goto = request.form.get('goto',
                                url_for('ContentManagementSystem:settings'))
        for name, value in request.form.items():
            if name.startswith('setting['):
                name = name[8:-1]
                setting, is_created = get_or_create(Setting, name=name)
                if is_created:
                    db.session.add(setting)
                setting.value = value

                db.session.commit()
        return redirect(goto)
Example #5
    def add_references_by_uniprot(data):

        full_uniprot, ref_type, value = data

        if '-' in full_uniprot:
            uniprot, isoform = full_uniprot.split('-')
            uniprot_tied_references = references.get(uniprot, None)
            if not uniprot_tied_references:
                return

            relevant_references = []
            # select relevant references:
            for reference in uniprot_tied_references:
                if any(entry.isoform == int(isoform)
                       for entry in reference.uniprot_entries):
                    relevant_references.append(reference)

        else:
            uniprot_tied_references = references.get(full_uniprot, None)
            if not uniprot_tied_references:
                return
            relevant_references = uniprot_tied_references

        if ref_type == 'UniProtKB-ID':
            # http://www.uniprot.org/help/entry_name
            # "Each >reviewed< entry is assigned a unique entry name upon integration into UniProtKB/Swiss-Prot"
            # Entry names come in the format X_Y:
            # - for a Swiss-Prot entry, X is a mnemonic protein identification code (at most 5 characters)
            # - for a TrEMBL entry, X is the same as the accession code (6 to 10 characters)
            x, y = value.split('_')

            if len(x) <= 5:
                for reference in relevant_references:
                    assert '-' not in full_uniprot
                    entry = UniprotEntry.query.filter_by(
                        accession=full_uniprot, reference=reference).one()
                    entry.reviewed = True

            return

        if ref_type in ensembl_references_to_collect:

            attr = ensembl_references_to_collect[ref_type]

            for relevant_reference in relevant_references:
                attrs = {'reference': relevant_reference, attr: value}

                peptide, new = get_or_create(EnsemblPeptide, **attrs)

                if new:
                    db.session.add(peptide)
    def add_uniprot_accession(data):

        # full uniprot includes isoform (if relevant)
        full_uniprot, ref_type, value = data

        if ref_type == 'RefSeq_NT':
            # get protein
            refseq_nm = value.split('.')[0]

            if not refseq_nm or not refseq_nm.startswith(
                    'NM') or not full_uniprot:
                return

            try:
                protein = Protein.query.filter_by(refseq=refseq_nm).one()
            except NoResultFound:
                return

            try:
                uniprot, isoform = full_uniprot.split('-')
                isoform = int(isoform)
            except ValueError:
                # only one isoform ?
                # print('No isoform specified for', full_uniprot, refseq_nm)
                uniprot = full_uniprot
                isoform = 1

            reference, new = get_or_create(ProteinReferences, protein=protein)
            uniprot_entry, new_uniprot = get_or_create(UniprotEntry,
                                                       accession=uniprot,
                                                       isoform=isoform)
            reference.uniprot_entries.append(uniprot_entry)
            references[uniprot].append(reference)

            if new:
                db.session.add(reference)
            if new_uniprot:
                db.session.add(uniprot_entry)
Example #7
    def __init__(self):

        print(f'Preparing {self.source_name} sites importer...')

        self.issues_counter = Counter()
        # caching proteins and kinases allows for much faster
        # import later on, though it takes some time to cache
        self.known_kinases = create_key_model_dict(Kinase,
                                                   'name',
                                                   lowercase=True)
        self.known_groups = create_key_model_dict(KinaseGroup,
                                                  'name',
                                                  lowercase=True)
        self.known_sites = create_key_model_dict(
            Site, ['protein_id', 'position', 'residue'],
            options=(joinedload(Site.sources).joinedload('*')))
        self.proteins = create_key_model_dict(
            Protein,
            'refseq',
            options=(load_only('refseq', 'sequence', 'id').joinedload(
                Protein.gene).joinedload(Gene.isoforms).load_only('refseq')))

        # create site types
        site_type_objects = [
            get_or_create(SiteType, name=name) for name in set(self.site_types)
        ]

        self.novel_site_types = [
            site_type for site_type, new in site_type_objects if new
        ]
        self.site_types_map = {
            site_type.name: site_type
            for site_type, new in site_type_objects
        }

        self.source, _ = get_or_create(SiteSource, name=self.source_name)

        print(f'{self.source_name} importer ready.')
        def parser(line):
            gene_name, p_value, fdr = line
            p_value = float(p_value)
            fdr = float(fdr)

            nonlocal to_high_fdr_count

            if fdr >= fdr_cutoff:
                to_high_fdr_count += 1
                return

            gene, created = get_or_create(Gene, name=gene_name)

            entry = GeneListEntry(gene=gene, p=p_value, fdr=fdr)
            list_entries.append(entry)
    def get_genomic_muts(self, chrom, dna_pos, dna_ref,
                         dna_alt) -> List['SearchResult']:
        """Returns aminoacid mutations meeting provided criteria.

        There may be several mutations with the same genomic coordinates and alleles,
        as there are many splicing isoforms produced from a single gene.

        Stop codon mutations are not considered.

        Args:
            chrom: chromosome number or identifier, without 'chr' prefix
            dna_pos: genomic position
            dna_ref: reference allele
            dna_alt: alternative allele

        Returns:
            list of items where each item contains Mutation object and additional metadata
        """
        from search.mutation_result import SearchResult

        from models import Protein, Mutation
        from database import get_or_create

        snv = make_snv_key(chrom, dna_pos, dna_ref, dna_alt)

        items = [decode_csv(item) for item in self[snv]]

        # this could be sped up by itemgetters, accumulative queries and so on
        results = []

        for item in items:

            protein = Protein.query.get(item['protein_id'])
            mutation, created = get_or_create(
                Mutation,
                protein=protein,
                protein_id=protein.id,  # TODO: should use either protein or protein_id
                position=item['pos'],
                alt=item['alt'])
            results.append(
                SearchResult(protein=protein,
                             mutation=mutation,
                             is_mutation_novel=created,
                             type='genomic',
                             **item))

        return results
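
For orientation, a hedged usage sketch of the method above; the mapping object's name, the coordinates, and the assumption that SearchResult exposes its constructor arguments as attributes are illustrative, not taken from the source:

# hypothetical lookup: chromosome given without the 'chr' prefix,
# with reference and alternative alleles, as described in the docstring
results = genome_mappings.get_genomic_muts('17', 7577120, 'G', 'A')

for result in results:
    # each SearchResult wraps the affected protein isoform and the mutation
    print(result.protein.refseq, result.mutation.position, result.mutation.alt)
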
Example #10
    def __setstate__(self, state):

        state['protein'] = Protein.query.filter_by(
            refseq=state['protein_refseq']
        ).one()
        del state['protein_refseq']

        state['mutation'], created = get_or_create(
            Mutation,
            protein=state['protein'],
            **state['mutation_kwargs']
        )
        del state['mutation_kwargs']

        state['meta_user'].mutation = state['mutation']
        state['mutation'].meta_user = state['meta_user']

        self.__dict__.update(state)
    def add_ncbi_mappings(data):
        # 9606    3329    HSPD1   NG_008915.1     NM_199440.1     NP_955472.1     reference standard
        taxonomy, entrez_id, gene_name, refseq_gene, lrg, refseq_nucleotide, t, refseq_peptide, p, category = data

        refseq_nm = refseq_nucleotide.split('.')[0]

        if not refseq_nm or not refseq_nm.startswith('NM'):
            return

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            return

        reference, new = get_or_create(ProteinReferences, protein=protein)

        if new:
            db.session.add(reference)

        reference.refseq_np = refseq_peptide.split('.')[0]
        reference.refseq_ng = refseq_gene.split('.')[0]
        gene = protein.gene

        if gene.name != gene_name:
            print(
                f'Gene name mismatch for RefSeq mappings: {gene.name} vs {gene_name}'
            )

        entrez_id = int(entrez_id)

        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print(
                    f'Entrez ID mismatch for isoforms of {gene.name} gene: {gene.entrez_id}, {entrez_id}'
                )
                if gene.name == gene_name:
                    print(
                        f'Overwriting {gene.entrez_id} entrez id with {entrez_id} for {gene.name} gene, '
                        f'because record with {entrez_id} has matching gene name'
                    )
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id
Example #12
    def save_text_entry(self):
        name = request.form['entry_id']
        new_content = request.form['new_content']

        text_entry, created = get_or_create(TextEntry, name=name)
        if created:
            db.session.add(text_entry)

        status = 200
        text_entry.content = new_content
        try:
            db.session.commit()
        except (IntegrityError, OperationalError) as e:
            print(e)
            db.session.rollback()
            status = 501

        result = {
            'status': status,
            'content': substitute_variables(text_entry.content)
        }
        return jsonify(result)
Example #13
    def save_inline_help(self):
        name = request.form['entry_id']
        old_content = request.form.get('old_content', None)
        new_content = request.form['new_content']

        help_entry, created = get_or_create(HelpEntry, name=name)
        if created:
            db.session.add(help_entry)

        if created or help_entry.content == old_content:
            status = 200
            help_entry.content = new_content
            try:
                db.session.commit()
            except (IntegrityError, OperationalError) as e:
                print(e)
                db.session.rollback()
                status = 501
        else:
            status = 409

        result = {'status': status, 'content': help_entry.content}
        return jsonify(result)
Example #14
    def parse(self, path):

        mutations = defaultdict(lambda: [0, set()])

        for line in iterate_tsv_gz_file(path, file_header=self.header):
            cancer_name, sample_name = self.decode_line(line)

            if sample_name in self.samples_to_skip:
                continue

            cancer, created = get_or_create(Cancer, name=cancer_name)

            if created:
                db.session.add(cancer)

            for mutation_id in self.preparse_mutations(line):

                key = (mutation_id, cancer.id)

                mutations[key][0] += 1
                mutations[key][1].add(sample_name)

        return mutations
def sites_motifs(data=None):

    motifs_data = [
        # site_type_name, name, pattern (Python regular expression), sequences for pseudo logo

        # https://prosite.expasy.org/PDOC00001
        [
            'N-glycosylation', 'N-linked', '.{7}N[^P][ST].{5}',
            [
                ' ' * 7 + f'N{aa}{st}' + ' ' * 5 for aa in aa_symbols
                if aa != 'P' for st in 'ST'
            ]
        ],
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4721579/
        [
            'N-glycosylation', 'N-linked - atypical', '.{7}N[^P][CV].{5}',
            [
                ' ' * 7 + f'N{aa}{cv}' + ' ' * 5 for aa in aa_symbols
                if aa != 'P' for cv in 'CV'
            ]
        ],

        # Based on https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1301293/
        [
            'O-glycosylation', 'O-linked TAPP', '.{7}TAPP',
            [' ' * 7 + 'TAPP' + ' ' * 5]
        ],
        [
            'O-glycosylation', 'O-linked TSAP', '.{7}TSAP',
            [' ' * 7 + 'TSAP' + ' ' * 5]
        ],
        [
            'O-glycosylation', 'O-linked TV.P', '.{7}TV.P',
            [' ' * 7 + 'TV.P' + ' ' * 5]
        ],
        [
            'O-glycosylation', 'O-linked [ST]P.P', '.{7}[ST]P.P',
            [' ' * 7 + f'{st}P P' + ' ' * 5 for st in 'ST']
        ],

        # https://www.uniprot.org/help/carbohyd
        [
            'C-glycosylation', 'C-linked W..W', '.{7}W..W.{4}',
            [' ' * 7 + 'W  W' + ' ' * 4]
        ],
        [
            'C-glycosylation', 'C-linked W..W', '.{4}W..W.{7}',
            [' ' * 4 + 'W  W' + ' ' * 7]
        ],
        [
            'C-glycosylation', 'C-linked W[ST].C', '.{7}W[ST].C.{4}',
            [' ' * 7 + f'W{st} C' + ' ' * 4 for st in 'ST']
        ],
    ]

    if data:
        motifs_data = data

    new_motifs = []

    for site_type_name, name, pattern, sequences in motifs_data:
        site_type, _ = get_or_create(SiteType, name=site_type_name)
        motif, new = get_or_create(SiteMotif,
                                   name=name,
                                   pattern=pattern,
                                   site_type=site_type)

        if new:
            new_motifs.append(motif)
            db.session.add(motif)

        motif.generate_pseudo_logo(sequences)

    return new_motifs
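
The motif patterns above describe fixed-width sequence windows with the modified residue at index 7 (matching the pseudo-logo strings of seven leading spaces). A minimal sketch of testing such a window against one of these patterns with the standard re module, using an invented example window and assuming that interpretation:

import re

# hypothetical 15-residue window with the candidate N-glycosylation site at index 7
window = 'AKLSGHPNVSTYQKL'

n_linked = re.compile('.{7}N[^P][ST].{5}')
if n_linked.fullmatch(window):
    print('window matches the N-linked glycosylation motif')
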
Example #16
        def clinvar_parser(line):
            nonlocal highest_disease_id, duplicates

            try:
                at_least_one_significant_sub_entry, values, sub_entries_cnt, names, statuses, significances = self.parse_metadata(
                    line)
            except MalformedRowError as e:
                print(str(e) + '\n' + line)
                return False

            # the following 2 lines are the result of issue #47 - we don't import
            # ClinVar mutations that do not have any diseases specified:
            if not at_least_one_significant_sub_entry:
                return

            for mutation_id in self.get_or_make_mutations(line):

                # take care of duplicates
                duplicated = self.look_after_duplicates(
                    mutation_id, clinvar_mutations, values[:4])
                if duplicated:
                    duplicates += 1
                    continue

                # take care of nearly-duplicates
                same_mutation_pointers = self.mutations_details_pointers_grouped_by_unique_mutations[
                    mutation_id]
                assert len(same_mutation_pointers) <= 1
                if same_mutation_pointers:
                    pointer = same_mutation_pointers[0]
                    old = self.data_as_dict(clinvar_mutations[pointer])
                    new = self.data_as_dict(values, mutation_id=mutation_id)

                    if old['db_snp_ids'] != [new['db_snp_ids']]:
                        clinvar_mutations[pointer][1].append(new['db_snp_ids'])

                    # if either of the dbSNP entries is validated, the mutation is validated
                    # (the same with presence in PubMed)
                    for key in ['is_validated', 'is_in_pubmed_central']:
                        if old[key] != new[key] and new[key]:
                            index = self.insert_keys.index(key)
                            clinvar_mutations[pointer][index] = True

                    print(
                        'Merged details referring to the same mutation (%s): %s into %s'
                        % (mutation_id, values, clinvar_mutations[pointer]))
                    continue

                self.protect_from_duplicates(mutation_id, clinvar_mutations)

                # Python 3.5 would make this easier with **values (but it is not available here)
                clinvar_mutations.append([
                    mutation_id,
                    [values[0]],
                    values[1],
                    values[2],
                    values[3],
                ])

                for i in range(sub_entries_cnt):
                    name = names[i]

                    # we don't want _uninteresting_ data
                    if name in ('not_specified', 'not provided'):
                        continue

                    if name in new_diseases:
                        disease_id = new_diseases[name]
                    else:
                        disease, created = get_or_create(Disease, name=name)
                        if created:
                            highest_disease_id += 1
                            new_diseases[name] = highest_disease_id
                            disease_id = highest_disease_id
                        else:
                            disease_id = disease.id

                    clinvar_data.append((
                        len(clinvar_mutations),
                        int(significances[i])
                        if significances is not None else None,
                        disease_id,
                        statuses[i] if statuses else None,
                    ))
Example #17
def external_references(path='data/HUMAN_9606_idmapping.dat.gz',
                        refseq_lrg='data/LRG_RefSeqGene',
                        refseq_link='data/refseq_link.tsv.gz'):
    from models import Protein
    from models import ProteinReferences
    from models import EnsemblPeptide
    from sqlalchemy.orm.exc import NoResultFound

    references = defaultdict(list)

    def add_uniprot_accession(data):

        # full uniprot includes isoform (if relevant)
        full_uniprot, ref_type, value = data

        if ref_type == 'RefSeq_NT':
            # get protein
            refseq_nm = value.split('.')[0]

            if not refseq_nm or not refseq_nm.startswith(
                    'NM') or not full_uniprot:
                return

            try:
                protein = Protein.query.filter_by(refseq=refseq_nm).one()
            except NoResultFound:
                return

            try:
                uniprot, isoform = full_uniprot.split('-')
                isoform = int(isoform)
            except ValueError:
                # only one isoform ?
                # print('No isoform specified for', full_uniprot, refseq_nm)
                uniprot = full_uniprot
                isoform = 1

            reference, new = get_or_create(ProteinReferences, protein=protein)
            uniprot_entry, _ = get_or_create(UniprotEntry,
                                             accession=uniprot,
                                             isoform=isoform)
            reference.uniprot_entries.append(uniprot_entry)
            references[uniprot].append(reference)

            if new:
                db.session.add(reference)

    ensembl_references_to_collect = {'Ensembl_PRO': 'peptide_id'}

    def add_references_by_uniprot(data):

        full_uniprot, ref_type, value = data

        if '-' in full_uniprot:
            uniprot, isoform = full_uniprot.split('-')
            uniprot_tied_references = references.get(uniprot, None)
            if not uniprot_tied_references:
                return

            relevant_references = []
            # select relevant references:
            for reference in uniprot_tied_references:
                if any(entry.isoform == int(isoform)
                       for entry in reference.uniprot_entries):
                    relevant_references.append(reference)

        else:
            uniprot_tied_references = references.get(full_uniprot, None)
            if not uniprot_tied_references:
                return
            relevant_references = uniprot_tied_references

        if ref_type == 'UniProtKB-ID':
            # http://www.uniprot.org/help/entry_name
            # "Each >reviewed< entry is assigned a unique entry name upon integration into UniProtKB/Swiss-Prot"
            # Entry names come in the format X_Y:
            # - for a Swiss-Prot entry, X is a mnemonic protein identification code (at most 5 characters)
            # - for a TrEMBL entry, X is the same as the accession code (6 to 10 characters)
            x, y = value.split('_')

            if len(x) <= 5:
                for reference in relevant_references:
                    assert '-' not in full_uniprot
                    entry = UniprotEntry.query.filter_by(
                        accession=full_uniprot, reference=reference).one()
                    entry.reviewed = True

            return

        if ref_type in ensembl_references_to_collect:

            attr = ensembl_references_to_collect[ref_type]

            for relevant_reference in relevant_references:
                attrs = {'reference': relevant_reference, attr: value}

                peptide, new = get_or_create(EnsemblPeptide, **attrs)

                if new:
                    db.session.add(peptide)

    def add_ncbi_mappings(data):
        # 9606    3329    HSPD1   NG_008915.1     NM_199440.1     NP_955472.1     reference standard
        taxonomy, entrez_id, gene_name, refseq_gene, lrg, refseq_nucleotide, t, refseq_peptide, p, category = data

        refseq_nm = refseq_nucleotide.split('.')[0]

        if not refseq_nm or not refseq_nm.startswith('NM'):
            return

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            return

        reference, new = get_or_create(ProteinReferences, protein=protein)

        if new:
            db.session.add(reference)

        reference.refseq_np = refseq_peptide.split('.')[0]
        reference.refseq_ng = refseq_gene.split('.')[0]
        gene = protein.gene

        if gene.name != gene_name:
            print('Gene name mismatch for RefSeq mappings: %s vs %s' %
                  (gene.name, gene_name))

        entrez_id = int(entrez_id)

        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print('Entrez ID mismatch for isoforms of %s gene: %s, %s' %
                      (gene.name, gene.entrez_id, entrez_id))
                if gene.name == gene_name:
                    print(
                        'Overwriting %s entrez id with %s for %s gene, because record with %s has matching gene name'
                        % (gene.entrez_id, entrez_id, gene.name, entrez_id))
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

    parse_tsv_file(refseq_lrg,
                   add_ncbi_mappings,
                   file_header=[
                       '#tax_id', 'GeneID', 'Symbol', 'RSG', 'LRG', 'RNA', 't',
                       'Protein', 'p', 'Category'
                   ])

    # add mappings retrieved from UCSC tables for completeness
    header = [
        '#name', 'product', 'mrnaAcc', 'protAcc', 'geneName', 'prodName',
        'locusLinkId', 'omimId'
    ]
    for line in iterate_tsv_gz_file(refseq_link, header):
        gene_name, protein_full_name, refseq_nm, refseq_peptide, _, _, entrez_id, omim_id = line

        if not refseq_nm or not refseq_nm.startswith('NM'):
            continue

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            continue

        gene = protein.gene

        if gene.name != gene_name:
            print('Gene name mismatch for RefSeq mappings: %s vs %s' %
                  (gene.name, gene_name))

        entrez_id = int(entrez_id)

        if protein_full_name:
            if protein.full_name:
                if protein.full_name != protein_full_name:
                    print(
                        'Protein full name mismatch: %s vs %s for %s' %
                        (protein.full_name, protein_full_name, protein.refseq))
                continue
            protein.full_name = protein_full_name

        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print('Entrez ID mismatch for isoforms of %s gene: %s, %s' %
                      (gene.name, gene.entrez_id, entrez_id))
                if gene.name == gene_name:
                    print(
                        'Overwriting %s entrez id with %s for %s gene, because record with %s has matching gene name'
                        % (gene.entrez_id, entrez_id, gene.name, entrez_id))
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

        if refseq_peptide:
            reference, new = get_or_create(ProteinReferences, protein=protein)

            if new:
                db.session.add(reference)

            if reference.refseq_np and reference.refseq_np != refseq_peptide:
                print(
                    'Refseq peptide mismatch between LRG and UCSC retrieved data: %s vs %s for %s'
                    % (reference.refseq_np, refseq_peptide, protein.refseq))

            reference.refseq_np = refseq_peptide

    parse_tsv_file(path,
                   add_uniprot_accession,
                   file_opener=gzip.open,
                   mode='rt')
    parse_tsv_file(path,
                   add_references_by_uniprot,
                   file_opener=gzip.open,
                   mode='rt')

    return [
        reference for reference_group in references.values()
        for reference in reference_group
    ]
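
parse_tsv_file and iterate_tsv_gz_file are not defined on this page. A minimal sketch of parse_tsv_file that is consistent with the calls above (the signature and header handling are assumptions) might be:

def parse_tsv_file(path, callback, file_header=None, file_opener=open, mode='r'):
    # stream a tab-separated file, optionally checking the header row,
    # and pass each split row to the provided callback
    with file_opener(path, mode) as f:
        if file_header is not None:
            header = next(f).rstrip('\n').split('\t')
            assert header == file_header, 'unexpected file header'
        for line in f:
            callback(line.rstrip('\n').split('\t'))
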
Example #18
def insert_data(db):

    # "Board Code (200 identifies HR within the department)","License Type Code (see tables below)","Licensee Name","Rank Code","Modifier Code (see tables below)","Mailing Name (if different from Licensee Name)","Mailing Street Address Line 1","Mailing Address Line 2","Mailing Address Line 3","Mailing City","Mailing State","Mailing Zip Code","Mailing County Code (see table below)","Primary Telephone Number",
    # "Business Name (Location)",Filler,"Location Street Address Line 1","Location Address Line 2","Location Address Line 3","Location City","Location State","Location Zip Code","Location County Code (see table below)","Secondary Telephone Number",District,Region,"License Number","Primary Status Code (see table below)","Secondary Status Code (see table below)","Expiry Date","Last Inspection Date",
    # "Base Risk","Secondary Risk",
    # Latitude,Longitude,"Accuracy Score","Accuracy Type",Number,Street,City,State,County,Zip,Country

    # 20 Primary Status Code == Current

    ad = geo_coords.PostalAddress
    gc = geo_coords.GeoCoords
    org = organization.Organization

    for i in xrange(1, 8):
        print "hrlodge%d.csv" % i

        rd = csv.DictReader(open("hrlodge%d.csv" % i, "r+"))
        for row in rd:
            code = row['Rank Code']
            if code == "HOTL":
                m = hotels.Hotel()
            elif code == "MOTL":
                m = hotels.Motel()
            elif code == 'BNB':
                m = hotels.BedAndBreakfast()
            elif code == "CNDO":
                m = hotels.Resort()
            elif code == "DWEL":
                m = hotels.Resort()
            elif code == "TAPT":
                m = hotels.Resort()
            else:
                continue

            m.name = row['Business Name (Location)'].decode('utf-8').encode(
                'ascii', 'xmlcharrefreplace')
            m.license_number = row['License Number']
            dt = row['Expiry Date'].split("/")
            m.license_expiry = datetime.date(int(dt[2]), int(dt[0]),
                                             int(dt[1]))
            dt = row['Last Inspection Date'].split("/")
            if len(dt) == 3:
                m.last_inspection = datetime.date(int(dt[2]), int(dt[0]),
                                                  int(dt[1]))

            m.status_code = row['Primary Status Code (see table below)']
            m.rooms = row[
                'Number of Seats (food service) or Rental Units (lodging)']

            m.telephone = row['Primary Telephone Number']

            m.address, isFound = get_or_create(
                db.session,
                ad,
                streetAddress=row['Location Street Address Line 1'],
                streetAddress2=row['Location Address Line 2'],
                streetAddress3=row['Location Address Line 3'],
                addressLocality=row['Location City'],
                addressRegion=row['Location State'],
                postalCode=row['Location Zip Code'])

            # Latitude,Longitude,"Accuracy Score","Accuracy Type",Number,Street,City,State,County,Zip,Country

            # XXX limit search to name and state
            morg, isFound = get_or_create(
                db.session,
                org,
                name=row['Licensee Name'].decode('utf-8').encode(
                    'ascii', 'xmlcharrefreplace'))
            if not isFound:
                morg.address, isFound = get_or_create(
                    db.session,
                    ad,
                    streetAddress=row['Mailing Street Address Line 1'],
                    streetAddress2=row['Mailing Address Line 2'],
                    streetAddress3=row['Mailing Address Line 3'],
                    addressLocality=row['Mailing City'],
                    addressRegion=row['Mailing State'],
                    postalCode=row['Mailing Zip Code'])
                #org.telephone =

            m.organization = morg

            db.session.add(m)
            db.session.add(morg)
            db.session.commit()

    db.session.commit()
Example #19
        def clinvar_parser(line):
            nonlocal highest_disease_id, duplicates

            metadata = line[20].split(';')

            clinvar_entry = make_metadata_ordered_dict(clinvar_keys, metadata)

            names, statuses, significances = (
                (entry.replace('|', ',').split(',') if entry else None)
                for entry in
                (
                    clinvar_entry[key]
                    for key in ('CLNDBN', 'CLNREVSTAT', 'CLNSIG')
                )
            )

            # those lengths should always be equal if they exist
            sub_entries_cnt = max(
                [
                    len(x)
                    for x in (names, statuses, significances)
                    if x
                ] or [0]
            )

            at_least_one_significant_sub_entry = False

            for i in range(sub_entries_cnt):

                try:
                    if names:
                        if names[i] not in ('not_specified', 'not provided'):
                            names[i] = self._beautify_disease_name(names[i])
                            at_least_one_significant_sub_entry = True
                    if statuses and statuses[i] == 'no_criteria':
                        statuses[i] = None
                except IndexError:
                    print('Malformed row (wrong count of subentries) on %s-th entry:' % i)
                    print(line)
                    return False

            values = list(clinvar_entry.values())

            # the following 2 lines are the result of issue #47 - we don't import
            # ClinVar mutations that do not have any diseases specified:
            if not at_least_one_significant_sub_entry:
                return

            for mutation_id in self.preparse_mutations(line):

                # take care of duplicates
                duplicated = self.look_after_duplicates(mutation_id, clinvar_mutations, values[:4])
                if duplicated:
                    duplicates += 1
                    continue

                # take care of nearly-duplicates
                same_mutation_pointers = self.mutations_details_pointers_grouped_by_unique_mutations[mutation_id]
                assert len(same_mutation_pointers) <= 1
                if same_mutation_pointers:
                    pointer = same_mutation_pointers[0]
                    old = self.data_as_dict(clinvar_mutations[pointer])
                    new = self.data_as_dict(values, mutation_id=mutation_id)

                    if old['db_snp_ids'] != [new['db_snp_ids']]:
                        clinvar_mutations[pointer][1].append(new['db_snp_ids'])

                    # if either of the dbSNP entries is validated, the mutation is validated
                    # (the same with presence in PubMed)
                    for key in ['is_validated', 'is_in_pubmed_central']:
                        if old[key] != new[key] and new[key]:
                            index = self.insert_keys.index(key)
                            clinvar_mutations[pointer][index] = True

                    print(
                        'Merged details referring to the same mutation (%s): %s into %s'
                        %
                        (mutation_id, values, clinvar_mutations[pointer])
                    )
                    continue

                self.protect_from_duplicates(mutation_id, clinvar_mutations)

                # Python 3.5 would make this easier with **values (but it is not available here)
                clinvar_mutations.append(
                    [
                        mutation_id,
                        [values[0]],
                        values[1],
                        values[2],
                        values[3],
                    ]
                )

                for i in range(sub_entries_cnt):
                    name = names[i]

                    # we don't want _uninteresting_ data
                    if name in ('not_specified', 'not provided'):
                        continue

                    if name in new_diseases:
                        disease_id = new_diseases[name]
                    else:
                        disease, created = get_or_create(Disease, name=name)
                        if created:
                            highest_disease_id += 1
                            new_diseases[name] = highest_disease_id
                            disease_id = highest_disease_id
                        else:
                            disease_id = disease.id

                    clinvar_data.append(
                        (
                            len(clinvar_mutations),
                            int(significances[i]) if significances is not None else None,
                            disease_id,
                            statuses[i] if statuses else None,
                        )
                    )