Example #1
0
def export_protein_sequences(SEQUENCE_PATH):
    """
    Exports amino acid sequences for protein_coding transcripts as gzipped fasta files to the desired path

    One file named "<species code>.aa.fasta.gz" is written for every species in the database. Only sequences
    whose type is "protein_coding" are included; their coding sequence is passed through translate() before
    being written.

    :param SEQUENCE_PATH: directory the files are written to, created when it does not exist yet
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(SEQUENCE_PATH, exist_ok=True)

    seq_table = Sequence.__table__

    for s in Species.query.all():
        filename = os.path.join(SEQUENCE_PATH, s.code + ".aa.fasta.gz")

        # ORM free query, only the columns needed for the export are fetched
        sequences = db.engine.execute(
            db.select([seq_table.c.name,
                       seq_table.c.type,
                       seq_table.c.coding_sequence])
            .where(seq_table.c.species_id == s.id)).fetchall()

        with gzip.open(filename, 'wb') as f:
            for name, sequence_type, sequence in sequences:
                if sequence_type != "protein_coding":
                    continue

                record = ">" + name + '\n' + translate(sequence) + '\n'
                f.write(record.encode('UTF-8'))
Example #2
0
    def generate(selected_species):
        """
        Generator that yields one fasta record (header line plus coding sequence, each newline terminated) for
        every sequence whose species_id matches the selected species

        :param selected_species: value the species_id column of the sequences is compared with
        """
        seq_table = Sequence.__table__
        query = db.select([seq_table.c.name, seq_table.c.coding_sequence]) \
            .where(seq_table.c.species_id == selected_species)

        for record_name, cds in db.engine.execute(query).fetchall():
            yield "".join((">", record_name, "\n", cds, "\n"))
Example #3
0
    def update_species_counts():
        """
        Adds phylo-profile to each go-label, results are stored in the database

        For every GO term the number of distinct sequences carrying that label is counted per species (only
        associations where predicted == 0 are considered). The counts are stored as a JSON object
        (species id -> count) in the species_counts field of the GO term.

        Associations that point to a sequence without a species are ignored.
        """
        seq_table = Sequence.__table__
        assoc_table = SequenceGOAssociation.__table__

        # link species to sequences
        sequences = db.engine.execute(db.select([seq_table.c.id, seq_table.c.species_id])).fetchall()

        sequence_to_species = {seq_id: int(species_id)
                               for seq_id, species_id in sequences
                               if species_id is not None}

        # get go for all genes
        associations = db.engine.execute(
            db.select([assoc_table.c.sequence_id, assoc_table.c.go_id], distinct=True)
            .where(assoc_table.c.predicted == 0)).fetchall()

        count = {}
        for seq_id, go_id in associations:
            species_id = sequence_to_species.get(seq_id)

            # sequences without a species were left out of the lookup above, they cannot contribute to a
            # per-species count (a plain [] lookup would raise a KeyError here)
            if species_id is None:
                continue

            per_species = count.setdefault(go_id, {})
            per_species[species_id] = per_species.get(species_id, 0) + 1

        # update counts
        for go_id, data in count.items():
            db.engine.execute(db.update(GO.__table__)
                              .where(GO.__table__.c.id == go_id)
                              .values(species_counts=json.dumps(data)))
Example #4
0
    def calculate_specificities(species_id,
                                description,
                                remove_background=False):
        """
        Function that calculates condition specificities for each profile. No grouping is applied, each condition is
        used as is

        :param species_id: internal species ID
        :param description: description for the method to determine the specificity
        :param remove_background: when true the lowest value of each profile is subtracted from all values (can be
        of use with noisy data derived from microarrays).
        :return: the value returned by calculate_tissue_specificities
        """
        # get profile from the database (ORM free for speed)
        profiles = db.engine.execute(
            db.select([
                ExpressionProfile.__table__.c.id,
                ExpressionProfile.__table__.c.profile
            ]).where(ExpressionProfile.__table__.c.species_id ==
                     species_id)).fetchall()

        # detect all conditions, keeping the order in which they are first seen; the set makes the membership
        # test constant time instead of scanning the list for every condition of every profile
        conditions = []
        seen = set()
        for _, profile in profiles:
            for condition in json.loads(profile)['order']:
                if condition not in seen:
                    seen.add(condition)
                    conditions.append(condition)

        # every condition is its own "tissue", convert list into dictionary and run function
        conditions_dict = {k: k for k in conditions}
        return ExpressionSpecificityMethod.calculate_tissue_specificities(
            species_id,
            description,
            conditions_dict,
            conditions,
            remove_background=remove_background)
Example #5
0
def species_download_coding(species_id):
    """
    Generates a fasta file with all coding sequences for a given species

    :param species_id: Internal ID of the species
    :return: Response with the fasta file, or a plain text 404 response when the species does not exist
    """
    current_species = Species.query.get(species_id)

    # query.get returns None for an unknown ID, answer with a 404 instead of failing on the attribute access below
    if current_species is None:
        return make_response("Species not found", 404)

    seq_table = Sequence.__table__
    sequences = db.engine.execute(
        db.select([seq_table.c.name, seq_table.c.coding_sequence])
        .where(seq_table.c.species_id == current_species.id)).fetchall()

    # two lines per sequence: the header and the sequence itself
    output = []
    for name, coding_sequence in sequences:
        output.append(">" + name)
        output.append(coding_sequence)

    response = make_response("\n".join(output))
    response.headers["Content-Disposition"] = "attachment; filename=" + current_species.code + ".cds.fasta"
    response.headers['Content-type'] = 'text/plain'

    return response
Example #6
0
    def calculate_similarities(gene_family_method_id=1, percentile_pass=0.95):
        """
        This function will calculate ALL similarities between clusters in the database. Results will be added to the
        DB

        Each cluster is reduced to the set of gene families its sequences belong to. Clusters with more than four
        families are compared pairwise using jaccard(), pairs scoring at or above the percentile cutoff are
        inserted (p_value and corrected_p_value are stored as 0).

        :param gene_family_method_id: Internal ID of gene family method to use to calculate the scores (default = 1)
        :param percentile_pass: percentile based cutoff (default = 0.95)
        """
        from itertools import combinations

        cluster_table = SequenceCoexpressionClusterAssociation.__table__
        family_table = SequenceFamilyAssociation.__table__
        gene_family_table = GeneFamily.__table__

        # sqlalchemy to fetch cluster associations
        # isnot(None) renders "IS NOT NULL"; python's own "is not None" on a column simply evaluates to True
        # and would not filter anything
        cluster_associations = db.engine.execute(
            db.select([cluster_table.c.sequence_id,
                       cluster_table.c.coexpression_cluster_id])
            .where(cluster_table.c.sequence_id.isnot(None))).fetchall()

        # sqlalchemy to fetch sequence family associations
        table = join(family_table, gene_family_table,
                     family_table.c.gene_family_id == gene_family_table.c.id)
        sequence_families = db.engine.execute(
            db.select([family_table.c.sequence_id,
                       family_table.c.gene_family_id,
                       gene_family_table.c.method_id])
            .select_from(table)
            .where(gene_family_table.c.method_id == gene_family_method_id)).fetchall()

        # convert sqlalchemy results into dictionary
        sequence_to_family = {seq_id: fam_id for seq_id, fam_id, _ in sequence_families}

        cluster_to_sequences = {}
        for seq_id, cluster_id in cluster_associations:
            cluster_to_sequences.setdefault(cluster_id, []).append(seq_id)

        # unique families per cluster, only clusters with more than 4 families take part in the comparison
        cluster_to_families = {}
        for cluster_id, sequences in cluster_to_sequences.items():
            families = list({sequence_to_family[s] for s in sequences if s in sequence_to_family})
            if len(families) > 4:
                cluster_to_families[cluster_id] = families

        # score every unordered pair of clusters once
        data = [[source, target, jaccard(cluster_to_families[source], cluster_to_families[target])]
                for source, target in combinations(cluster_to_families, 2)]

        if not data:
            print("No similar clusters found!")
            return

        ordered_j = sorted(row[2] for row in data)

        # clamp the index so percentile_pass=1.0 selects the highest score instead of running off the list
        cutoff_index = min(int(len(ordered_j) * percentile_pass), len(ordered_j) - 1)
        percentile_cutoff = ordered_j[cutoff_index]

        database = [{
            'source_id': source,
            'target_id': target,
            'gene_family_method_id': gene_family_method_id,
            'jaccard_index': score,
            'p_value': 0,
            'corrected_p_value': 0
        } for source, target, score in data if score >= percentile_cutoff]

        db.engine.execute(CoexpressionClusterSimilarity.__table__.insert(),
                          database)
Example #7
0
    def predict_from_network_enrichment(expression_network_method_id, cutoff=0.05, source="PlaNet Prediction"):
        """
        Predicts GO terms for the genes in a network based on the GO terms found in their neighborhood

        For each probe of the network method the non-predicted GO terms of its neighbors (excluding terms the gene
        has already) are counted and tested with hypergeo_sf against the background taken from the precomputed
        species_counts field. Terms with a p-value below the cutoff are stored as predicted associations with
        evidence 'IEP'; the FDR corrected p-value is recorded in prediction_data.

        :param expression_network_method_id: internal ID of the network method to use
        :param cutoff: p-value (before FDR correction) a term needs to stay below to be stored
        :param source: value stored in the source field of the new associations
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        probes = expression_network_method.probes.all()
        probe_total = len(probes)

        # keys in the species_counts JSON are strings
        species_key = str(expression_network_method.species_id)

        # Get all GO terms and get background
        # Important, counts are obtained from precomputed counts in the species_counts field !!
        go_data = db.engine.execute(db.select([GO.__table__.c.id, GO.__table__.c.species_counts])).fetchall()

        go_background = defaultdict(int)

        for go_id, counts_json in go_data:
            # truthiness check skips empty strings as well as NULL values (identity comparison with "" is
            # unreliable and would let None through to json.loads)
            if not counts_json:
                continue

            counts = json.loads(counts_json)
            if species_key in counts:
                go_background[go_id] = counts[species_key]

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # without neighbors there is nothing to predict, skip the queries
            if not sequence_ids:
                continue

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = {a.go_id for a in own_associations}

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = {(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms}

            go_counts = defaultdict(int)
            for _, neighbor_go_id in unique_associations:
                go_counts[neighbor_go_id] += 1

            # find significantly enriched GO terms and store them
            enriched_go = []

            for go_id, count in go_counts.items():
                p_value = hypergeo_sf(count, len(sequence_ids), go_background[go_id], probe_total)
                if p_value < cutoff:
                    enriched_go.append((go_id, p_value))

            # apply FDR correction to the p-values
            corrected_p_values = fdr_correction([p for _, p in enriched_go])

            # push new prediction in a dict that will be added to the DB
            for corrected_p, (go_id, p_value) in zip(corrected_p_values, enriched_go):
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': go_id,
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'p-cutoff': cutoff,
                                                   'p-value': p_value,
                                                   'p-value (FDR)': corrected_p,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighborhood enrichment'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])
Example #8
0
    def calculate_tissue_specificities(species_id,
                                       description,
                                       condition_to_tissue,
                                       order,
                                       remove_background=False,
                                       use_max=True):
        """
        Function calculates tissue specific genes based on the expression conditions. A dict is required to link
        specific conditions to the correct tissues. This also allows conditions to be excluded in case they are
        unrelated with a specific tissue.

        For each profile only the tissue with the highest specificity score is stored. When none of the conditions
        in order maps to a tissue the (empty) method is still created, but no specificities are calculated.

        :param species_id: internal species ID
        :param description: description for the method to determine the specificity
        :param condition_to_tissue: dict to connect a condition to a tissue
        :param order: preferred order of the conditions, will match tissues to it
        :param remove_background: subtracts the lowest value to correct for background noise
        :param use_max: uses the maximum of mean values instead of the mean of all values
        :return id of the new method
        """
        new_method = ExpressionSpecificityMethod()
        new_method.species_id = species_id
        new_method.description = description
        new_method.menu_order = 0

        # tissues in the order their conditions appear in order, without duplicates
        tissues = []
        for c in order:
            if c in condition_to_tissue:
                v = condition_to_tissue[c]
                if v not in tissues:
                    tissues.append(v)

        # get profile from the database (ORM free for speed)
        profiles = db.engine.execute(
            db.select([
                ExpressionProfile.__table__.c.id,
                ExpressionProfile.__table__.c.profile
            ]).where(ExpressionProfile.__table__.c.species_id ==
                     species_id)).fetchall()

        new_method.conditions = json.dumps(tissues)

        db.session.add(new_method)
        db.session.commit()

        # without tissues there is nothing to score (min() and the top-hit lookup below need at least one)
        if not tissues:
            return new_method.id

        # detect specificities and add to the database
        specificities = []

        for profile_id, profile in profiles:
            # prepare profile data for calculation
            profile_data = json.loads(profile)
            profile_means = {}
            for t in tissues:
                values = []
                means = []
                for k, v in profile_data['data'].items():
                    if k in condition_to_tissue and condition_to_tissue[k] == t:
                        values += v
                        means.append(mean(v))

                if use_max:
                    profile_means[t] = max(means) if means else 0
                else:
                    profile_means[t] = mean(values) if values else 0

            # subtract minimum value to remove background
            # experimental code !
            if remove_background:
                minimum = min(profile_means.values())

                for k in profile_means:
                    profile_means[k] -= minimum

            # determine spm score for each condition
            profile_tau = tau(list(profile_means.values()))
            profile_entropy = entropy_from_values(list(profile_means.values()))

            profile_specificities = [{
                'profile_id': profile_id,
                'condition': t,
                'score': expression_specificity(t, profile_means),
                'entropy': profile_entropy,
                'tau': profile_tau,
                'method_id': new_method.id,
            } for t in tissues]

            # only the top scoring condition is kept (first one wins on ties, like a stable descending sort)
            specificities.append(max(profile_specificities, key=lambda x: x['score']))

            # write specificities to db if there are more than 400 (ORM free for speed)
            if len(specificities) > 400:
                db.engine.execute(ExpressionSpecificity.__table__.insert(),
                                  specificities)
                specificities = []

        # write remaining specificities to the db, an insert should not be executed with an empty parameter list
        if specificities:
            db.engine.execute(ExpressionSpecificity.__table__.insert(),
                              specificities)
        return new_method.id
Example #9
0
    def calculate_ecc(network_method_ids, gene_family_method_id, max_size=100):
        """
        Function to calculate the ECC scores in and between genes of different networks

        ORM free method for speed !

        Pairs of sequences from the same gene family are compared, significant scores are stored in both
        directions (query -> target and target -> query).

        :param network_method_ids: array of networks (using their internal id !) to compare
        :param gene_family_method_id: internal id of the type of family methods to be used for the comparison
        :param max_size: passed on to the threshold and ECC helpers (presumably the maximum neighborhood size
        considered, to be confirmed against those helpers)
        """
        from itertools import combinations

        network_families = {}
        sequence_network = {}
        sequence_network_method = {}
        sequence_family = {}
        family_sequence = {}

        # Get all the network information and store in dictionary
        for n in network_method_ids:
            current_network = db.engine.execute(db.select([ExpressionNetwork.__table__.c.sequence_id,
                                                           ExpressionNetwork.__table__.c.network,
                                                           ExpressionNetwork.__table__.c.method_id]).
                                                where(ExpressionNetwork.__table__.c.method_id == n).
                                                where(ExpressionNetwork.__table__.c.sequence_id.isnot(None))
                                                ).fetchall()

            for sequence, network, network_method_id in current_network:
                if sequence is not None:
                    sequence_network[int(sequence)] = network
                    sequence_network_method[int(sequence)] = int(network_method_id)

        # Get family data and store in dictionary
        current_families = db.engine.execute(db.select([SequenceFamilyAssociation.__table__.c.sequence_id,
                                                        SequenceFamilyAssociation.__table__.c.gene_family_id,
                                                        GeneFamily.__table__.c.method_id]).
                                             select_from(SequenceFamilyAssociation.__table__.join(GeneFamily.__table__)).
                                             where(GeneFamily.__table__.c.method_id == gene_family_method_id)
                                             ).fetchall()

        for sequence, family, _ in current_families:
            # convert once so the membership test and the stored key always use the same (int) value
            sequence_id, family_id = int(sequence), int(family)

            sequence_family[sequence_id] = family_id
            family_sequence.setdefault(family_id, []).append(sequence_id)

        # Create a dict (key = network) with the families present in that network
        # Families that occur multiple times should be present multiple times as this is used
        # to set thresholds later !

        for sequence, network_method in sequence_network_method.items():
            families_in_network = network_families.setdefault(network_method, [])

            # ignore sequences without a family, ideally this shouldn't happen
            if sequence in sequence_family:
                families_in_network.append(sequence_family[sequence])

        # Determine threshold and p-value
        # A background model will be computed for each combination of networks, an ECC score will need to be better
        # than 95 % of the randomly found values to be considered significant

        thresholds = {}
        print("Starting permutation tests")
        for n in network_method_ids:
            thresholds[n] = {}
            for m in network_method_ids:
                thresholds[n][m] = ExpressionNetworkMethod.__set_thresholds(network_families[n],
                                                                            network_families[m],
                                                                            max_size=max_size)

        # Data loaded start calculating ECCs
        new_ecc_scores = []

        for family, sequences in family_sequence.items():
            # every unordered pair of sequences within the family is considered once
            for query, target in combinations(sequences, 2):
                if query == target or query not in sequence_network or target not in sequence_network:
                    continue

                # Ignore genes with overlapping neighborhoods
                if ExpressionNetworkMethod.__neighborhoods_overlap(sequence_network[query], sequence_network[target]):
                    continue

                query_method = sequence_network_method[query]
                target_method = sequence_network_method[target]

                ecc, significant = ExpressionNetworkMethod.__ecc(sequence_network[query],
                                                                 sequence_network[target],
                                                                 sequence_family,
                                                                 thresholds[query_method][target_method],
                                                                 family,
                                                                 max_size=max_size)
                if not significant:
                    continue

                new_ecc_scores.append({
                    'query_id': query,
                    'target_id': target,
                    'ecc': ecc,
                    'gene_family_method_id': gene_family_method_id,
                    'query_network_method_id': query_method,
                    'target_network_method_id': target_method,
                })

                # add reciprocal relation
                new_ecc_scores.append({
                    'query_id': target,
                    'target_id': query,
                    'ecc': ecc,
                    'gene_family_method_id': gene_family_method_id,
                    'query_network_method_id': target_method,
                    'target_network_method_id': query_method,
                })

                # write to the db in batches (ORM free for speed)
                if len(new_ecc_scores) > 400:
                    db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)
                    new_ecc_scores = []

        # write remaining scores, an insert should not be executed with an empty parameter list
        if new_ecc_scores:
            db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)