Exemple #1
0
class ClusterCladeEnrichment(db.Model):
    """
    Enrichment record tying a coexpression cluster to a clade for one gene family method.

    Besides the resulting scores, the counts that went into the calculation are stored on the
    row so they can be shown without recomputing them.
    """
    __tablename__ = 'cluster_clade_enrichment'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    cluster_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    clade_id = db.Column(db.Integer, db.ForeignKey('clades.id', ondelete='CASCADE'))
    gene_family_method_id = db.Column(db.Integer, db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'))

    gene_family_method = db.relationship(
        'GeneFamilyMethod',
        lazy='joined',
        backref=db.backref('clade_enrichment', passive_deletes=True, lazy='dynamic'),
    )

    cluster = db.relationship(
        'CoexpressionCluster',
        lazy='joined',
        backref=db.backref('clade_enrichment', passive_deletes=True, lazy='dynamic'),
    )

    clade = db.relationship(
        'Clade',
        lazy='joined',
        backref=db.backref('enriched_clusters', passive_deletes=True, lazy='dynamic'),
    )

    # Counts required to calculate the enrichment, stored here for quick access
    cluster_count = db.Column(db.Integer)
    cluster_size = db.Column(db.Integer)
    clade_count = db.Column(db.Integer)
    clade_size = db.Column(db.Integer)

    # Enrichment score (log-transformed), p-value and corrected p-value. Calculated using the
    # hypergeometric distribution and applying FDR correction (aka. BH)
    enrichment = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    @property
    def cluster_percentage(self):
        """
        ``cluster_count`` expressed as a percentage of ``cluster_size``.

        :return: cluster_count * 100 / cluster_size
        """
        scaled_count = self.cluster_count * 100
        return scaled_count / self.cluster_size

    @property
    def genome_percentage(self):
        """
        ``clade_count`` expressed as a percentage of ``clade_size``.

        :return: clade_count * 100 / clade_size
        """
        scaled_count = self.clade_count * 100
        return scaled_count / self.clade_size
Exemple #2
0
class CoexpressionClusterSimilarity(db.Model):
    """
    Similarity between two coexpression clusters (source and target) for one gene family method.

    Each row stores a Jaccard index together with a p-value and a corrected p-value.
    """
    __tablename__ = 'coexpression_cluster_similarity'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    # Both ends of the comparison point at the same clusters table
    source_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    target_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))

    gene_family_method_id = db.Column(
        'gene_family_method_id',
        db.Integer,
        db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'),
        index=True,
    )

    jaccard_index = db.Column(db.Float, index=True)
    p_value = db.Column(db.Float, index=True)
    corrected_p_value = db.Column(db.Float, index=True)

    # Two foreign keys reference the same table, so each relationship names the key it uses
    source = db.relationship(
        'CoexpressionCluster',
        foreign_keys=[source_id],
        lazy='joined',
        backref=db.backref('similarity_sources', passive_deletes=True, lazy='dynamic'),
    )

    target = db.relationship(
        'CoexpressionCluster',
        foreign_keys=[target_id],
        lazy='joined',
        backref=db.backref('similarity_targets', passive_deletes=True, lazy='dynamic'),
    )

    gene_family_method = db.relationship(
        'GeneFamilyMethod',
        lazy='joined',
        backref=db.backref('CoexpressionClusterSimilarities', passive_deletes=True),
    )

    @staticmethod
    def empty_table():
        """
        Remove every row from this table. Use carefully !

        The delete is issued through the model's query; no commit is performed here.
        """
        all_rows = CoexpressionClusterSimilarity.query
        all_rows.delete()
Exemple #3
0
class SequenceSequenceCladeAssociation(db.Model):
    """
    Links a pair of sequences to the tree and clade they were associated in.

    The ``duplication`` flag tells duplications from speciations; for duplications a
    consistency score is stored as well.
    """
    __tablename__ = 'sequence_sequence_clade'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    sequence_one_id = db.Column(
        db.Integer,
        db.ForeignKey('sequences.id', ondelete='CASCADE'))
    sequence_two_id = db.Column(
        db.Integer,
        db.ForeignKey('sequences.id', ondelete='CASCADE'))

    clade_id = db.Column(
        db.Integer,
        db.ForeignKey('clades.id', ondelete='CASCADE'),
        index=True)
    tree_id = db.Column(
        db.Integer,
        db.ForeignKey('trees.id', ondelete='CASCADE'),
        index=True)

    duplication = db.Column(db.SmallInteger)
    duplication_consistency_score = db.Column(db.Float)

    tree = db.relationship(
        'Tree',
        backref=db.backref('sequence_sequence_clade_associations',
                           passive_deletes=True,
                           lazy='dynamic'),
        lazy='joined')

    clade = db.relationship(
        'Clade',
        backref=db.backref('sequence_sequence_clade_associations',
                           passive_deletes=True,
                           lazy='dynamic'),
        lazy='joined')

    def __str__(self):
        """Return the row ID formatted as a decimal string."""
        return "%d" % (self.id,)

    @property
    def readable_type(self):
        """
        Human-readable event type of this association.

        :return: string "Duplication" when the duplication flag is set, otherwise "Speciation"
        """
        if self.duplication:
            return "Duplication"
        return "Speciation"

    @property
    def readable_score(self):
        """
        Duplication consistency score formatted for display.

        :return: the score in "%.3f" format for duplications, "Not available" for speciations
        """
        if not self.duplication:
            return "Not available"
        return "%.3f" % self.duplication_consistency_score
Exemple #4
0
class SequenceFamilyAssociation(db.Model):
    """Association row linking one sequence to one gene family."""
    __tablename__ = 'sequence_family'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))
    gene_family_id = db.Column(db.Integer, db.ForeignKey('gene_families.id', ondelete='CASCADE'))

    # Reachable from the sequence side as Sequence.family_associations
    sequence = db.relationship(
        'Sequence',
        lazy='joined',
        backref=db.backref('family_associations', passive_deletes=True, lazy='dynamic'),
    )

    # Reachable from the family side as GeneFamily.sequence_associations
    family = db.relationship(
        'GeneFamily',
        lazy='joined',
        backref=db.backref('sequence_associations', passive_deletes=True, lazy='dynamic'),
    )
Exemple #5
0
class FamilyGOAssociation(db.Model):
    """Association row linking one gene family to one GO term."""
    __tablename__ = 'family_go'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    gene_family_id = db.Column(db.Integer, db.ForeignKey('gene_families.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    # Reachable from the family side as GeneFamily.go_annotations
    gene_family = db.relationship(
        'GeneFamily',
        lazy='joined',
        backref=db.backref('go_annotations', passive_deletes=True, lazy='dynamic'),
    )

    # Reachable from the GO side as GO.family_associations
    go_term = db.relationship(
        'GO',
        lazy='joined',
        backref=db.backref('family_associations', passive_deletes=True, lazy='dynamic'),
    )
Exemple #6
0
class SequenceInterproAssociation(db.Model):
    """
    Association row linking one sequence to one InterPro entry.

    ``start`` and ``stop`` are optional integers stored with the association.
    """
    __tablename__ = 'sequence_interpro'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))
    interpro_id = db.Column(db.Integer, db.ForeignKey('interpro.id', ondelete='CASCADE'))
    start = db.Column(db.Integer, default=None)
    stop = db.Column(db.Integer, default=None)

    # Reachable from the sequence side as Sequence.interpro_associations
    sequence = db.relationship(
        'Sequence',
        lazy='joined',
        backref=db.backref('interpro_associations', passive_deletes=True, lazy='dynamic'),
    )

    # Reachable from the InterPro side as Interpro.sequence_associations
    domain = db.relationship(
        'Interpro',
        lazy='joined',
        backref=db.backref('sequence_associations', passive_deletes=True, lazy='dynamic'),
    )
Exemple #7
0
class SequenceCoexpressionClusterAssociation(db.Model):
    """
    Association row placing a probe (and, when known, its sequence) in a coexpression cluster.
    """
    __tablename__ = 'sequence_coexpression_cluster'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    probe = db.Column(db.String(50), index=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))
    coexpression_cluster_id = db.Column(db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))

    # Reachable from the sequence side as Sequence.coexpression_cluster_associations
    sequence = db.relationship(
        'Sequence',
        lazy='joined',
        backref=db.backref('coexpression_cluster_associations', passive_deletes=True, lazy='dynamic'),
    )

    # Reachable from the cluster side as CoexpressionCluster.sequence_associations
    coexpression_cluster = db.relationship(
        'CoexpressionCluster',
        lazy='joined',
        backref=db.backref('sequence_associations', passive_deletes=True, lazy='dynamic'),
    )
Exemple #8
0
class CoexpressionClusteringMethod(db.Model):
    __tablename__ = 'coexpression_clustering_methods'
    id = db.Column(db.Integer, primary_key=True)
    network_method_id = db.Column(db.Integer,
                                  db.ForeignKey(
                                      'expression_network_methods.id',
                                      ondelete='CASCADE'),
                                  index=True)
    method = db.Column(db.Text)
    cluster_count = db.Column(db.Integer)

    clusters = db.relationship('CoexpressionCluster',
                               backref=db.backref('method', lazy='joined'),
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)

    @staticmethod
    def update_counts():
        """
        Precalculate the number of clusters of every clustering method.

        The count is written to ``cluster_count`` of each method so it does not need to be
        counted again on every access. All updates are committed together; when the commit
        fails the session is rolled back and the error is printed.
        """
        for clustering_method in CoexpressionClusteringMethod.query.all():
            clustering_method.cluster_count = clustering_method.clusters.count()

        try:
            db.session.commit()
        except Exception as error:
            db.session.rollback()
            print(error)

    @staticmethod
    def clusters_from_neighborhoods(method, network_method_id):
        """
        Build one cluster per probe neighborhood of a network and store them in the database.

        Every probe of the network that is linked to a sequence and has at least one neighbor
        with a ``gene_id`` yields a cluster. The cluster is named after the probe's sequence and
        contains that sequence followed by its neighbors.

        Nothing is written when no cluster could be derived, or when the new clustering method
        itself could not be stored (clusters would otherwise be inserted without a method).

        :param method: name stored in the ``method`` field of the new clustering method
        :param network_method_id: ID of the expression network method whose probes are used
        """
        def commit_or_rollback():
            """Commit the session; on failure roll back, print the error and return False."""
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)
                return False
            return True

        probes = ExpressionNetwork.query.filter_by(
            method_id=network_method_id).all()  # Load all probes

        clusters = {}  # cluster name -> list of member sequence IDs
        sequence_to_probe = {}

        for p in probes:
            # Only consider probes linked with sequences
            if p.sequence_id is None:
                continue

            sequence_to_probe[p.sequence_id] = p.probe
            sequence_ids = [
                n["gene_id"] for n in json.loads(p.network)
                if n.get("gene_id") is not None
            ]

            # Only keep sequences that actually have neighbors
            if sequence_ids:
                clusters[p.sequence.name] = [p.sequence_id] + sequence_ids

        if not clusters:
            return

        # Add new method first, its ID is required by the clusters
        new_method = CoexpressionClusteringMethod()
        new_method.network_method_id = network_method_id
        new_method.method = method
        new_method.cluster_count = len(clusters)

        db.session.add(new_method)

        if not commit_or_rollback():
            # The method was rolled back and has no ID, there is nothing to attach clusters to
            return

        # Add clusters, committing in batches of 400
        clusters_orm = {}
        for cluster_name in clusters:
            new_cluster = CoexpressionCluster()
            new_cluster.method_id = new_method.id
            new_cluster.name = cluster_name
            db.session.add(new_cluster)
            clusters_orm[cluster_name] = new_cluster

            if len(clusters_orm) % 400 == 0:
                commit_or_rollback()
        commit_or_rollback()

        # Add sequence cluster relations, committing every 20 clusters
        for i, (cluster_name, members) in enumerate(clusters.items()):
            cluster_id = clusters_orm[cluster_name].id

            # A cluster without ID was not stored (its batch failed), skip its relations
            if cluster_id is not None:
                for sequence_id in members:
                    relation = SequenceCoexpressionClusterAssociation()
                    relation.sequence_id = sequence_id
                    relation.coexpression_cluster_id = cluster_id
                    relation.probe = sequence_to_probe.get(sequence_id)

                    db.session.add(relation)

            if i % 20 == 0:
                commit_or_rollback()
        commit_or_rollback()

    @staticmethod
    def build_hcca_clusters(method,
                            network_method_id,
                            step_size=3,
                            hrr_cutoff=30,
                            min_cluster_size=40,
                            max_cluster_size=200):
        """
        method to build HCCA clusters for a certain network

        The network is loaded from the database, made symmetric (every edge is present on both
        of its nodes), clustered with HCCA and the resulting clusters and their members are
        stored. Nothing is stored when no clusters are found or when the new clustering method
        could not be saved.

        :param method: Name for the current clustering method
        :param network_method_id: ID for the network to cluster
        :param step_size: desired step_size for the HCCA algorithm
        :param hrr_cutoff: desired hrr_cutoff for the HCCA algorithm
        :param min_cluster_size: minimal cluster size
        :param max_cluster_size: maximum cluster size
        """
        def commit_or_rollback():
            """Commit the session; on failure roll back, print the error and return False."""
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)
                return False
            return True

        network_data = {}  # sequence ID -> {neighbor sequence ID: hrr}
        sequence_probe = {}

        # Get network from DB. No newline here, "Done!" below completes the line.
        print("Loading Network data from DB...", end='', flush=True)
        ExpressionNetworkMethod.query.get_or_404(
            network_method_id)  # Check if method exists

        probes = ExpressionNetwork.query.filter_by(
            method_id=network_method_id).all()  # Load all probes

        for p in probes:
            # Loop over probes and store hrr for all neighbors
            if p.sequence_id is None:
                continue

            network_data[p.sequence_id] = {
                nb["gene_id"]: nb["hrr"]
                for nb in json.loads(p.network)
                if "hrr" in nb and nb.get("gene_id") is not None
            }

            sequence_probe[p.sequence_id] = p.probe

        # Double check edges are reciprocally defined. Iterate over a snapshot: nodes that only
        # occur as a neighbor are added to network_data inside the loop, and adding keys to a
        # dict while iterating over it raises a RuntimeError.
        for sequence, data in list(network_data.items()):
            for neighbor, score in data.items():
                network_data.setdefault(neighbor, {}).setdefault(sequence, score)

        print("Done!\nStarting to build Clusters...\n")

        # Build clusters
        hcca_util = HCCA(step_size=step_size,
                         hrr_cutoff=hrr_cutoff,
                         min_cluster_size=min_cluster_size,
                         max_cluster_size=max_cluster_size)

        hcca_util.load_data(network_data)

        hcca_util.build_clusters()

        # Unique cluster names, the second element of every entry in hcca_util.clusters
        clusters = {t[1] for t in hcca_util.clusters}

        if not clusters:
            print("No clusters found! Not adding anything to DB !")
            return

        print("Done building clusters, adding clusters to DB")

        # Add new method first, its ID is required by the clusters
        new_method = CoexpressionClusteringMethod()

        new_method.network_method_id = network_method_id
        new_method.method = method
        new_method.cluster_count = len(clusters)

        db.session.add(new_method)

        if not commit_or_rollback():
            # The method was rolled back and has no ID, there is nothing to attach clusters to
            return

        # Add cluster and store as dict
        cluster_dict = {}

        for c in clusters:
            new_cluster = CoexpressionCluster()
            new_cluster.method_id = new_method.id
            new_cluster.name = c

            db.session.add(new_cluster)
            cluster_dict[c] = new_cluster

        commit_or_rollback()

        # Link sequences to clusters
        for i, (gene_id, cluster_name, _) in enumerate(hcca_util.clusters):
            stored_cluster = cluster_dict.get(cluster_name)
            cluster_id = stored_cluster.id if stored_cluster is not None else None

            # Only link to clusters that made it into the database
            if cluster_id is not None:
                relation = SequenceCoexpressionClusterAssociation()

                relation.probe = sequence_probe.get(gene_id)
                relation.sequence_id = gene_id
                relation.coexpression_cluster_id = cluster_id

                db.session.add(relation)

            if i > 0 and i % 400 == 0:
                # Add relations in sets of 400
                commit_or_rollback()

        # Add remaining relations
        commit_or_rollback()

    @staticmethod
    def add_lstrap_coexpression_clusters(cluster_file,
                                         description,
                                         network_id,
                                         prefix='cluster_',
                                         min_size=10):
        """
        Adds MCL clusters, as produced by LSTrAP, to the database

        Each line of the file is one cluster: a whitespace-separated list of probes. Lines with
        fewer than ``min_size`` probes are ignored. Kept clusters are numbered consecutively,
        starting at 1, and named ``prefix`` followed by the zero-padded (4 digits) number.

        :param cluster_file: path to file with clusters
        :param description: description to add to database for this set of clusters
        :param network_id: network the clusters are based on
        :param prefix: prefix for individual cluster names (default 'cluster_')
        :param min_size: minimal size of a cluster (default = 10)
        :return: ID of new clustering method
        """
        # Upper-case sequence name -> sequence, for case-insensitive lookups
        sequence_dict = {
            sequence.name.upper(): sequence
            for sequence in Sequence.query.all()
        }

        # add coexpression clustering method to the database
        clustering_method = CoexpressionClusteringMethod()
        clustering_method.network_method_id = network_id
        clustering_method.method = description

        try:
            db.session.add(clustering_method)
            db.session.commit()
        except Exception as error:
            db.session.rollback()
            print(error)
            # NOTE(review): this terminates the interpreter from library code - confirm intended
            quit()

        cluster_number = 1
        with open(cluster_file) as handle:
            for line in handle:
                probes = line.strip().split()

                if len(probes) < min_size:
                    continue

                cluster_name = "%s%04d" % (prefix, cluster_number)
                cluster_number += 1

                new_cluster = CoexpressionCluster()
                new_cluster.method_id = clustering_method.id
                new_cluster.name = cluster_name

                db.session.add(new_cluster)

                try:
                    db.session.commit()
                except Exception as error:
                    db.session.rollback()
                    print(error)
                    # The cluster was not stored, so its members cannot be linked
                    continue

                for probe in probes:
                    # NOTE(review): removes every '.1' in the probe name, not only a trailing one
                    gene = probe.replace('.1', '')
                    known_sequence = sequence_dict.get(gene.upper())

                    new_association = SequenceCoexpressionClusterAssociation()
                    new_association.probe = probe
                    # Probes without a matching sequence are stored with an empty sequence_id
                    if known_sequence is None:
                        new_association.sequence_id = None
                    else:
                        new_association.sequence_id = known_sequence.id
                    new_association.coexpression_cluster_id = new_cluster.id
                    db.session.add(new_association)

                try:
                    db.session.commit()
                except Exception as error:
                    db.session.rollback()
                    print(error)

        return clustering_method.id
Exemple #9
0
class TreeMethod(db.Model):
    """
    Groups the trees that belong to one gene family method, with a free-text description.
    """
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)

    description = db.Column(db.Text)

    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Reconcile every tree of this method with the clades stored in the database.

        For each internal node with exactly two descendants the species found in both branches
        are used to look up a clade and to decide whether the node is a duplication. The node is
        renamed to ``<clade id>_<D|S>_<duplication consistency>`` (0 is used for a missing clade
        and for the consistency of speciations). When a clade was found, associations are
        created in both directions for every pair of sequences taken from the two branches.

        Leaves whose name is not a known sequence are ignored, both for the species lookup and
        when creating associations. Associations are inserted in bulk once more than 400 have
        been collected; the remaining ones are inserted at the end. Finally the annotated
        Newick string of every processed tree is written to its ``data_phyloxml`` field and the
        session is committed.
        """
        # NOTE(review): the numbered print statements are debugging traces, kept as they were
        print("\n1.====================Getting into function reconcile_trees")

        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        insert_associations = SequenceSequenceCladeAssociation.__table__.insert()
        new_associations = []

        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]
            print("\n3.=========================tree loaded ok")

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        print("\n5.================Non-Binary-node: " +
                              str(node.is_binary))
                        # Show which tree is non-binary; the node is skipped but the
                        # reconciliation of the other nodes continues
                        print("Non-Binary tree: " + t.data_newick)
                    # Otherwise it is a leaf node and can be skipped
                    continue

                branch_one_seq = [
                    leaf.name.strip() for leaf in node.descendants[0].get_leaves()
                ]
                branch_two_seq = [
                    leaf.name.strip() for leaf in node.descendants[1].get_leaves()
                ]

                # Leaves that are not in the database do not contribute a species
                branch_one_species = {
                    seq_to_species[s] for s in branch_one_seq
                    if s in seq_to_species
                }
                print("\n8.===============Branch-one-spp: " +
                      ', '.join(branch_one_species))

                branch_two_species = {
                    seq_to_species[s] for s in branch_two_seq
                    if s in seq_to_species
                }
                print("\n9.===============Branch-two-spp: " +
                      ', '.join(branch_two_species))

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species,
                                                   branch_two_species,
                                                   clade_to_species)

                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(
                        branch_one_species, branch_two_species)

                tags = [
                    clade_to_id[clade] if clade is not None else 0,
                    'D' if duplication else 'S',
                    duplication_consistency if duplication else 0
                ]

                node.name = '_'.join(str(tag) for tag in tags)

                if clade is None:
                    continue

                # Skip leaves without a known sequence, same as for the species lookup above
                branch_one_ids = [seq_to_id[s] for s in branch_one_seq if s in seq_to_id]
                branch_two_ids = [seq_to_id[s] for s in branch_two_seq if s in seq_to_id]

                for id_one in branch_one_ids:
                    for id_two in branch_two_ids:
                        # Store the association in both directions
                        for first_id, second_id in ((id_one, id_two), (id_two, id_one)):
                            new_associations.append({
                                'sequence_one_id': first_id,
                                'sequence_two_id': second_id,
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score': duplication_consistency
                            })

            if len(new_associations) > 400:
                db.engine.execute(insert_associations, new_associations)
                new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # Executing the insert with an empty parameter list would run it without any values,
        # so only insert when there is something left
        if new_associations:
            db.engine.execute(insert_associations, new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data:
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
Exemple #10
0
class TreeMethod(db.Model):
    """
    Database model for the 'tree_methods' table: a described method that owns a collection of Tree records and
    references the gene family method it belongs to.
    """
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)

    # Free-text description of the method
    description = db.Column(db.Text)

    # Indexed foreign key to gene_family_methods.id, declared with ON DELETE CASCADE
    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    # Trees of this method, exposed as a query (lazy='dynamic'); each Tree gets a joined-loaded 'method' backref
    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Reconcile all trees of this method with the clades stored in the database.

        Every node with exactly two descendants is renamed to '<clade id>_<D|S>_<score>': the id of the clade
        returned by phylo.get_clade for the species below the node (0 if none), 'D' when phylo.is_duplication
        reports a duplication and 'S' otherwise, and the duplication consistency score (0 when it is not a
        duplication). When a clade was found, a SequenceSequenceCladeAssociation row is inserted in both
        directions for every pair of leaves taken from the two branches.

        The relabelled tree is serialised with newick.dumps and stored in the tree's data_phyloxml field; the
        session is committed at the end.

        :raises KeyError: if a leaf of a reconciled node is not the name of a sequence in the database
        """
        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []

        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciling
            tree = newick.loads(t.data_newick)[0]

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        # Warn about non-binary nodes. The identifiers come from the database record `t`;
                        # `tree` is the parsed newick node and carries neither an id nor a label.
                        # NOTE(review): assumes the Tree model exposes `label` - falls back to None otherwise.
                        print(
                            "[%d, %s] Skipping node... Can only reconcile binary nodes ..."
                            % (t.id, getattr(t, 'label', None)))
                    # Otherwise it is a leaf node and can be skipped
                    continue

                branch_one_seq = [
                    leaf.name.strip()
                    for leaf in node.descendants[0].get_leaves()
                ]
                branch_two_seq = [
                    leaf.name.strip()
                    for leaf in node.descendants[1].get_leaves()
                ]

                # Leaves that are not known sequences are ignored when collecting the species
                branch_one_species = {
                    seq_to_species[s] for s in branch_one_seq
                    if s in seq_to_species
                }
                branch_two_species = {
                    seq_to_species[s] for s in branch_two_seq
                    if s in seq_to_species
                }

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species,
                                                   branch_two_species,
                                                   clade_to_species)

                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(
                        branch_one_species, branch_two_species)

                tags = [
                    clade_to_id[clade] if clade is not None else 0,
                    'D' if duplication else 'S',
                    duplication_consistency if duplication else 0
                ]

                node.name = '_'.join(str(tag) for tag in tags)

                if clade is None:
                    continue

                clade_id = clade_to_id[clade]
                duplication_flag = 1 if duplication else 0

                for seq_one in branch_one_seq:
                    for seq_two in branch_two_seq:
                        id_one = seq_to_id[seq_one]
                        id_two = seq_to_id[seq_two]

                        # The association is stored in both directions
                        for first, second in ((id_one, id_two),
                                              (id_two, id_one)):
                            new_associations.append({
                                'sequence_one_id': first,
                                'sequence_two_id': second,
                                'tree_id': t.id,
                                'clade_id': clade_id,
                                'duplication': duplication_flag,
                                'duplication_consistency_score':
                                duplication_consistency
                            })

            # Flush the buffer once it grows large (checked once per tree)
            if len(new_associations) > 400:
                db.engine.execute(
                    SequenceSequenceCladeAssociation.__table__.insert(),
                    new_associations)
                new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # Insert what is left; skipped when empty so no INSERT is issued with an empty parameter list
        if new_associations:
            db.engine.execute(
                SequenceSequenceCladeAssociation.__table__.insert(),
                new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data:
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
Exemple #11
0
class ExpressionProfile(db.Model):
    """
    Database model for the 'expression_profiles' table: the expression profile of one probe, linked to a species
    and (optionally, the column is nullable) to a sequence.
    """
    __tablename__ = 'expression_profiles'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'),
                            index=True)
    # Profile stored as text (the methods of this class parse it with json.loads); deferred, so it is only
    # loaded when accessed or explicitly undeferred
    profile = db.deferred(db.Column(db.Text))

    # ExpressionSpecificity rows of this profile, exposed as a query (lazy='dynamic')
    specificities = db.relationship('ExpressionSpecificity',
                                    backref=db.backref('profile',
                                                       lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    def __init__(self, probe, sequence_id, profile):
        self.probe = probe
        self.sequence_id = sequence_id
        self.profile = profile

    @staticmethod
    def __profile_to_table(data):
        """
        Internal function to convert an expression profile (dict) to a tabular text

        :param data: Dict with expression profile
        :return: table (string)
        """
        output = [["condition", "mean", "min", "max"]]
        order = data["order"]

        for o in order:
            try:
                values = data["data"][o]
                output.append(
                    [o,
                     str(mean(values)),
                     str(min(values)),
                     str(max(values))])
            except Exception as e:
                print(e)

        return '\n'.join(['\t'.join(l) for l in output])

    @property
    def table(self):
        """
        Returns the condition expression as a tabular text file

        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(json.loads(self.profile))

        return table

    def tissue_table(self, condition_tissue_id, use_means=True):
        """
        Returns the tissue expression as a tabular text file

        :param condition_tissue_id: condition_tissue_id for the conversion
        :param use_means: Use the mean of the condition (recommended)
        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(
            self.tissue_profile(condition_tissue_id, use_means=use_means))
        return table

    @property
    def low_abundance(self, cutoff=10):
        """
        Checks if the mean expression value in any conditions in the plot is higher than the desired cutoff

        :param cutoff: cutoff for expression, default = 10
        :return: True in case of low abundance otherwise False
        """
        data = json.loads(self.profile)

        checks = [mean(v) > cutoff for _, v in data["data"].items()]

        return not any(checks)

    @staticmethod
    def convert_profile(condition_to_tissue, profile_data, use_means=True):
        """
        Convert a full, detailed profile into a more general summarized one using conversion table stored in the
        database

        :param condition_to_tissue: dict with conversion instructions
        :param profile_data: profile to convert
        :param use_means: use means of detailed condition if True otherwise use samples independently. Default True
        :return: New profile
        """
        tissues = list(set(condition_to_tissue['conversion'].values()))

        output = {}

        for t in tissues:
            valid_conditions = [
                k for k in profile_data['data']
                if k in condition_to_tissue['conversion']
                and condition_to_tissue['conversion'][k] == t
            ]
            valid_values = []
            for k, v in profile_data['data'].items():
                if k in valid_conditions:
                    if use_means:
                        valid_values.append(mean(v))
                    else:
                        valid_values += v

            output[t] = valid_values if len(valid_values) > 0 else [0]

        return {
            'order': condition_to_tissue['order'],
            'colors': condition_to_tissue['colors'],
            'data': output
        }

    def tissue_profile(self, condition_tissue_id, use_means=True):
        """
        Convert the profile with the conversion table stored in a ConditionTissue record, grouping several
        conditions into one more general feature (e.g. tissue).

        :param condition_tissue_id: identifier of the conversion table
        :param use_means: store the mean of the condition rather than individual values. This matches the spm
        calculations better.
        :return: parsed profile
        """
        conversion_record = ConditionTissue.query.get(condition_tissue_id)

        return ExpressionProfile.convert_profile(
            json.loads(conversion_record.data),
            json.loads(self.profile),
            use_means=use_means)

    @staticmethod
    def get_heatmap(species_id, probes, zlog=True, raw=False):
        """
        Returns a heatmap for a given species (species_id) and a list of probes. It returns a dict with 'order',
        the order of the experiments (taken from the last profile processed), and 'heatmap_data', a list with
        one entry per profile found.

        For each profile the mean per condition is calculated and then, depending on the flags:
        - zlog=True: log2 of the value divided by the mean of the row, '-' where this cannot be calculated
        - zlog=False, raw=False: value divided by the highest value in the row (if that is not zero)
        - zlog=False, raw=True: the mean values as they are

        Probes for which no profile was found are reported to the user through flash().

        :param species_id: species id (internal database id)
        :param probes: a list of probes to include in the heatmap
        :param zlog: enable zlog transformation (otherwise normalization against highest expressed condition)
        :param raw: when zlog is disabled, skip the normalization and return the mean values
        :return: dict with keys 'order' and 'heatmap_data'
        """
        profiles = ExpressionProfile.query.options(undefer('profile')).filter_by(species_id=species_id).\
            filter(ExpressionProfile.probe.in_(probes)).all()

        order = []

        output = []

        not_found = [p.lower() for p in probes]

        for profile in profiles:
            name = profile.probe
            data = json.loads(profile.profile)
            order = data['order']
            experiments = data['data']

            # sequence_id is nullable, a profile is not guaranteed to be linked to a sequence
            sequence = profile.sequence

            with contextlib.suppress(ValueError):
                not_found.remove(profile.probe.lower())

            if sequence is not None:
                with contextlib.suppress(ValueError):
                    not_found.remove(sequence.name.lower())

            values = {o: mean(experiments[o]) for o in order}

            # A profile without conditions has nothing to normalize; avoid mean()/max() on empty input
            row_mean = mean(values.values()) if values else 0
            row_max = max(values.values()) if values else 0

            for o in order:
                if zlog:
                    if row_mean == 0 or values[o] == 0:
                        values[o] = '-'
                    else:
                        try:
                            values[o] = log(values[o] / row_mean, 2)
                        except ValueError:
                            # e.g. a negative ratio
                            print("Unable to calculate log()", values[o],
                                  row_mean)
                            values[o] = '-'
                else:
                    if row_max != 0 and not raw:
                        values[o] = values[o] / row_max

            output.append({
                "name": name,
                "values": values,
                "sequence_id": profile.sequence_id,
                "shortest_alias":
                sequence.shortest_alias if sequence is not None else None
            })

        if len(not_found) > 0:
            flash("Couldn't find profile for: %s" % ", ".join(not_found),
                  "warning")

        return {'order': order, 'heatmap_data': output}

    @staticmethod
    def get_profiles(species_id, probes, limit=1000):
        """
        Fetch the ExpressionProfile records (with the deferred profile column loaded) for a set of probes of
        one species. The number of records is capped to avoid overly long queries.

        :param species_id: internal id of the species
        :param probes: probe names to fetch
        :param limit: maximum number of probes to get
        :return: List of ExpressionProfile objects including the full profiles
        """
        query = ExpressionProfile.query.options(undefer('profile'))
        query = query.filter(ExpressionProfile.probe.in_(probes))
        query = query.filter_by(species_id=species_id)

        # join the sequence, restricted to its name and without its xrefs
        sequence_options = joinedload('sequence').load_only('name').noload('xrefs')
        query = query.options(sequence_options)

        return query.limit(limit).all()

    @staticmethod
    def add_profile_from_lstrap(matrix_file,
                                annotation_file,
                                species_id,
                                order_color_file=None):
        """
        Function to convert an (normalized) expression matrix (lstrap output) into profiles and insert them
        into the database.

        The annotation file (tab delimited, first line is a header) assigns each sample (run) to a condition;
        matrix columns without annotation are ignored. The order of the conditions is read from
        order_color_file; when that yields no order, the annotated conditions found in the matrix header are
        used in alphabetical order (and no colors are stored).

        :param matrix_file: path to the expression matrix
        :param annotation_file: path to the file assigning samples to conditions
        :param species_id: internal id of the species
        :param order_color_file: tab delimited file that contains the order and color of conditions
        """
        annotation = {}

        with open(annotation_file, 'r') as fin:
            # get rid of the header
            fin.readline()

            for line in fin:
                parts = line.strip().split('\t')
                if len(parts) > 1:
                    run, description = parts
                    annotation[run] = description

        order, colors = [], []
        if order_color_file is not None:
            with open(order_color_file, 'r') as fin:
                for line in fin:
                    try:
                        o, c = line.strip().split('\t')
                    except ValueError:
                        # not exactly two tab-delimited fields (e.g. an empty line): ignore the line
                        continue

                    order.append(o)
                    colors.append(c)

        # build conversion table for sequences, key = sequence name uppercase, value internal id
        sequences = Sequence.query.filter_by(species_id=species_id).all()
        sequence_dict = {s.name.upper(): s.id for s in sequences}

        with open(matrix_file) as fin:
            # read header
            _, *colnames = fin.readline().rstrip().split()

            colnames = [c.replace('.htseq', '') for c in colnames]

            # determine the order from the annotation if it is not defined. `order` starts out as an empty
            # list (never None), so emptiness is what needs to be checked here.
            if not order:
                order = sorted(
                    {annotation[c] for c in colnames if c in annotation})

            # read each line and build profile
            new_probes = []
            for line in fin:
                fields = line.split()
                if not fields:
                    # skip blank lines (e.g. at the end of the file)
                    continue

                transcript, *values = fields
                profile = defaultdict(list)

                for c, v in zip(colnames, values):
                    if c in annotation:
                        profile[annotation[c]].append(float(v))

                new_probes.append({
                    "species_id": species_id,
                    "probe": transcript,
                    # None if the transcript is not a known sequence of this species
                    "sequence_id": sequence_dict.get(transcript.upper()),
                    "profile": json.dumps({
                        "order": order,
                        "colors": colors,
                        "data": profile
                    })
                })

                # insert in batches to keep memory usage in check
                if len(new_probes) > 400:
                    db.engine.execute(ExpressionProfile.__table__.insert(),
                                      new_probes)
                    new_probes = []

            # insert what is left; skipped when empty so no INSERT is issued with an empty parameter list
            if new_probes:
                db.engine.execute(ExpressionProfile.__table__.insert(),
                                  new_probes)
Exemple #12
0
class SequenceSequenceECCAssociation(db.Model):
    """
    Database model for the 'sequence_sequence_ecc' table: an ECC (Expression Context Conservation) score
    between a query and a target sequence, together with the gene family method and the expression network
    methods (one for the query, one for the target) it was calculated with.
    """
    __tablename__ = 'sequence_sequence_ecc'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    # Both ends of the association reference the sequences table
    query_id = db.Column(db.Integer,
                         db.ForeignKey('sequences.id', ondelete='CASCADE'))
    target_id = db.Column(db.Integer,
                          db.ForeignKey('sequences.id', ondelete='CASCADE'))

    # ECC score with its p-value and corrected p-value
    ecc = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    gene_family_method_id = db.Column(
        db.Integer, db.ForeignKey('gene_family_methods.id',
                                  ondelete='CASCADE'))
    query_network_method_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))
    target_network_method_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))

    gene_family_method = db.relationship('GeneFamilyMethod',
                                         lazy='joined',
                                         backref=db.backref(
                                             'ecc_as_family_method',
                                             lazy='dynamic',
                                             passive_deletes=True))

    # Two relationships point at ExpressionNetworkMethod, so each one names the foreign key it uses
    query_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[query_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_query_method',
                           lazy='dynamic',
                           passive_deletes=True))
    target_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[target_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_target_method',
                           lazy='dynamic',
                           passive_deletes=True))

    @staticmethod
    def get_ecc_network(sequence, network, family):
        """
        Get network connecting a specific sequence to all sequences it has an ECC association with (as query),
        for the given network method and gene family method.

        The network contains the query node, one node per association and an edge (edge_type 0) from the query
        to each target. Associations among the targets themselves (within the same network method) are added
        as edges with edge_type 1, unless the edge in the opposite direction is already present.

        :param sequence: internal ID of sequence
        :param network: network method ID to consider
        :param family: kind of gene families used to detect ECC
        :return: network dict (can be made compatible using CytoscapeHelper)
        """
        data = SequenceSequenceECCAssociation.query.filter(
            and_(
                SequenceSequenceECCAssociation.query_id == sequence,
                SequenceSequenceECCAssociation.query_network_method_id ==
                network, SequenceSequenceECCAssociation.gene_family_method_id
                == family)).all()

        # return an empty network in case there are no hits for this query
        if not data:
            return {'nodes': [], 'edges': []}

        # add the query node
        first = data[0]
        nodes = [{
            "id": first.query_sequence.name,
            "name": first.query_sequence.name,
            "species_id": first.query_sequence.species_id,
            "species_name": first.query_sequence.species.name,
            "gene_id": first.query_id,
            "gene_name": first.query_sequence.name,
            "network_method_id": network,
            "node_type": "query"
        }]
        edges = []

        # (source, target) of every edge added so far, to look up reverse edges without scanning the list
        edge_pairs = set()

        # target sequence ids grouped by the network method of the target
        networks = {}

        for d in data:
            nodes.append({
                "id": d.target_sequence.name,
                "name": d.target_sequence.name,
                "species_id": d.target_sequence.species_id,
                "species_name": d.target_sequence.species.name,
                "gene_id": d.target_id,
                "network_method_id": d.target_network_method_id,
                "gene_name": d.target_sequence.name
            })

            networks.setdefault(d.target_network_method_id,
                                []).append(d.target_id)

            # TODO: add p-value and corrected p once implemented
            edges.append({
                "source": d.query_sequence.name,
                "target": d.target_sequence.name,
                "ecc_score": d.ecc,
                "edge_type": 0
            })
            edge_pairs.add((d.query_sequence.name, d.target_sequence.name))

        for method_id, sequence_ids in networks.items():
            new_data = SequenceSequenceECCAssociation.query.filter(
                and_(
                    SequenceSequenceECCAssociation.query_id.in_(sequence_ids),
                    SequenceSequenceECCAssociation.target_id.in_(sequence_ids),
                    SequenceSequenceECCAssociation.target_network_method_id ==
                    method_id,
                    SequenceSequenceECCAssociation.query_network_method_id ==
                    method_id,
                    SequenceSequenceECCAssociation.gene_family_method_id ==
                    family, SequenceSequenceECCAssociation.query_id !=
                    SequenceSequenceECCAssociation.target_id)).all()

            for nd in new_data:
                source = nd.query_sequence.name
                target = nd.target_sequence.name

                # TODO: add p-value and corrected p once implemented
                # make sure the connection doesn't exist already (in the opposite direction)
                if (target, source) not in edge_pairs:
                    edges.append({
                        "source": source,
                        "target": target,
                        "ecc_score": nd.ecc,
                        "edge_type": 1
                    })
                    edge_pairs.add((source, target))

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def get_ecc_pair_network(ecc_id):
        """
        Get all data for a SequenceSequenceECCAssociation to make an ECC graph, similar to the pairwise
        comparisons in Movahedi et al.

        The graph contains the query and the target sequence connected by an 'ecc' edge, and for both of them
        the neighbors stored in their network (for the network method of the association) connected by
        'expression' edges. Responds with a 404 if the association or one of the networks does not exist.

        :param ecc_id: internal id of the SequenceSequenceECCAssociation
        :return: tuple of the ecc pair with neighborhood as graph dict and the gene family method id
        """
        association = SequenceSequenceECCAssociation.query.get_or_404(ecc_id)

        query = association.query_sequence
        target = association.target_sequence

        nodes = [
            {
                "id": query.name,
                "name": query.name,
                "species_id": query.species_id,
                "species_name": query.species.name,
                "gene_id": association.query_id,
                "gene_name": query.name,
                "network_method_id": association.query_network_method_id,
                "node_type": "query"
            },
            {
                "id": target.name,
                "name": target.name,
                "species_id": target.species_id,
                "species_name": target.species.name,
                "gene_id": association.target_id,
                "gene_name": target.name,
                "network_method_id": association.target_network_method_id,
                "node_type": "query"
            },
        ]

        edges = [{
            "source": query.name,
            "target": target.name,
            "ecc_score": association.ecc,
            'ecc_pair_color': "#D33",
            "edge_type": "ecc"
        }]

        query_network = query.network_nodes.filter_by(
            method_id=association.query_network_method_id).first_or_404(
            ).network
        target_network = target.network_nodes.filter_by(
            method_id=association.target_network_method_id).first_or_404(
            ).network

        query_network_data = json.loads(query_network)
        target_network_data = json.loads(target_network)

        # ids of the sequences that already have a node
        sequences = [query.id, target.id]

        def add_neighborhood(center, network_method_id, neighbors):
            # Add a node for every neighbor that has none yet and an expression edge from center to it
            for neighbor in neighbors:
                gene_id = neighbor.get('gene_id')
                gene_name = neighbor.get('gene_name')

                if gene_id not in sequences:
                    sequences.append(gene_id)
                    nodes.append({
                        "id": gene_name,
                        "name": gene_name,
                        "species_id": center.species_id,
                        "species_name": center.species.name,
                        "gene_id": gene_id,
                        "gene_name": gene_name,
                        "network_method_id": network_method_id,
                        "node_type": "target"
                    })

                edges.append({
                    "source": center.name,
                    "target": gene_name,
                    "link_score": neighbor.get('link_score', 0),
                    "edge_type": "expression",
                    'ecc_pair_color': "#3D3"
                })

        add_neighborhood(query, association.query_network_method_id,
                         query_network_data)
        add_neighborhood(target, association.target_network_method_id,
                         target_network_data)

        graph = {"nodes": nodes, "edges": edges}

        return graph, association.gene_family_method_id

    @staticmethod
    def get_ecc_multi_network(gf_method_id, sequence_ids):
        """
        Creates an ECC network for multiple genes. It contains the ECC associations among the input genes
        ('ecc' edges) and, for every gene in such an association, the neighbors stored in its network
        ('expression' edges). Pruning this network keeping only genes with non-unique label co-occurrences is
        recommended !

        Responds with a 404 if a sequence has no network for the network method of its association.

        :param gf_method_id: gene family method used to detect ECC
        :param sequence_ids: sequences to include as the core of the network
        :return: tuple of the network dict and gf_method_id
        """
        associations = SequenceSequenceECCAssociation.query.\
            filter(SequenceSequenceECCAssociation.gene_family_method_id == gf_method_id).\
            filter(and_(SequenceSequenceECCAssociation.query_id.in_(sequence_ids),
                        SequenceSequenceECCAssociation.target_id.in_(sequence_ids))).\
            all()

        nodes, edges = [], []

        # ids of the sequences that already have a node
        node_sequence_ids = set()

        # one entry per (sequence, network method). A sequence usually occurs in several associations;
        # tracking the pairs that were handled avoids fetching and parsing the same network again
        networks = []
        seen_networks = set()

        for a in associations:
            sides = (
                (a.query_id, a.query_sequence, a.query_network_method_id),
                (a.target_id, a.target_sequence, a.target_network_method_id),
            )

            for sequence_id, sequence, network_method_id in sides:
                network_key = (sequence_id, network_method_id)

                if network_key not in seen_networks:
                    seen_networks.add(network_key)
                    network = sequence.network_nodes.filter_by(
                        method_id=network_method_id).first_or_404().network
                    networks.append(
                        (sequence_id, sequence.name, sequence.species_id,
                         sequence.species.name, network_method_id, network))

                if sequence_id not in node_sequence_ids:
                    node_sequence_ids.add(sequence_id)
                    nodes.append({
                        "id": sequence.name,
                        "name": sequence.name,
                        "species_id": sequence.species_id,
                        "species_name": sequence.species.name,
                        "gene_id": sequence_id,
                        "gene_name": sequence.name,
                        "network_method_id": network_method_id,
                        "node_type": "query"
                    })

            edges.append({
                "source": a.query_sequence.name,
                "target": a.target_sequence.name,
                "ecc_score": a.ecc,
                'ecc_pair_color': "#D33",
                "edge_type": "ecc"
            })

        # expression edges added so far, stored in both directions
        new_edges = set()

        for sequence_id, sequence_name, species_id, species_name, network_method_id, n in networks:
            for node in json.loads(n):
                gene_id = node.get('gene_id')
                gene_name = node.get('gene_name')

                if gene_id not in node_sequence_ids:
                    node_sequence_ids.add(gene_id)
                    nodes.append({
                        "id": gene_name,
                        "name": gene_name,
                        "species_id": species_id,
                        "species_name": species_name,
                        "gene_id": gene_id,
                        "gene_name": gene_name,
                        "network_method_id": network_method_id,
                        "node_type": "target"
                    })

                if (sequence_name, gene_name) not in new_edges:
                    new_edges.add((sequence_name, gene_name))
                    new_edges.add((gene_name, sequence_name))

                    edges.append({
                        "source": sequence_name,
                        "target": gene_name,
                        "link_score": node.get('link_score', 0),
                        "edge_type": "expression",
                        'ecc_pair_color': "#3D3"
                    })

        return {"nodes": nodes, "edges": edges}, gf_method_id
Exemple #13
0
class ExpressionNetworkMethod(db.Model):
    """
    Describes one expression network method of a species and bundles the helpers to cache the number of networks
    per method and to calculate ECC scores in and between networks.
    """
    __tablename__ = 'expression_network_methods'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id'), index=True)
    description = db.Column(db.Text)
    edge_type = db.Column(db.Enum("rank", "weight", name='edge_type'))
    # Cached number of networks for this method, refreshed by update_count()
    probe_count = db.Column(db.Integer)

    hrr_cutoff = db.Column(db.Integer)
    pcc_cutoff = db.Column(db.Float)
    enable_second_level = db.Column(db.SmallInteger)

    probes = db.relationship('ExpressionNetwork',
                             backref=db.backref('method', lazy='joined'),
                             lazy='dynamic',
                             cascade="all, delete-orphan",
                             passive_deletes=True)

    clustering_methods = db.relationship('CoexpressionClusteringMethod',
                                         backref='network_method',
                                         lazy='dynamic',
                                         cascade='all, delete-orphan',
                                         passive_deletes=True)

    def __init__(self, species_id, description, edge_type="rank"):
        """
        Creates a new method, the second level is disabled by default.

        :param species_id: internal id of the species the networks belong to
        :param description: free text description of the method
        :param edge_type: type of the edges, "rank" (default) or "weight"
        """
        self.species_id = species_id
        self.description = description
        self.edge_type = edge_type
        self.enable_second_level = False

    def __repr__(self):
        # String formatting (instead of concatenation) so a method without a description can still be printed
        return "%s. %s [%s]" % (self.id, self.description, self.species)

    @staticmethod
    def update_count():
        """
        To avoid long count queries the number of networks for each method can be precalculated and stored in the
        database using this function
        """
        methods = ExpressionNetworkMethod.query.all()

        for m in methods:
            m.probe_count = m.probes.count()

        try:
            db.session.commit()
        except Exception as e:
            # Best effort: undo the pending changes and report the problem
            db.session.rollback()
            print(e)

    @staticmethod
    @benchmark
    def calculate_ecc(network_method_ids, gene_family_method_id, max_size=100):
        """
        Function to calculate the ECC scores in and between genes of different networks

        ORM free method for speed !

        Significant scores are written to the table of SequenceSequenceECCAssociation, once for each direction of
        a pair.

        :param network_method_ids: array of networks (using their internal id !) to compare
        :param gene_family_method_id: internal id of the type of family methods to be used for the comparison
        :param max_size: maximum number of families considered when thresholds are determined (default = 100)
        """

        network_families = {}
        sequence_network = {}
        sequence_network_method = {}
        sequence_family = {}
        family_sequence = {}

        # Get all the network information and store in dictionary
        for n in network_method_ids:
            # Every requested method gets a (possibly empty) list of families, without this the permutation tests
            # below fail with a KeyError for a method that has no usable networks
            network_families.setdefault(n, [])

            current_network = db.engine.execute(db.select([ExpressionNetwork.__table__.c.sequence_id,
                                                           ExpressionNetwork.__table__.c.network,
                                                           ExpressionNetwork.__table__.c.method_id]).
                                                where(ExpressionNetwork.__table__.c.method_id == n).
                                                where(ExpressionNetwork.__table__.c.sequence_id.isnot(None))
                                                ).fetchall()

            for sequence, network, network_method_id in current_network:
                if sequence is not None:
                    sequence_network[int(sequence)] = network
                    sequence_network_method[int(sequence)] = int(network_method_id)

        # Get family data and store in dictionary
        current_families = db.engine.execute(db.select([SequenceFamilyAssociation.__table__.c.sequence_id,
                                                        SequenceFamilyAssociation.__table__.c.gene_family_id,
                                                        GeneFamily.__table__.c.method_id]).
                                             select_from(SequenceFamilyAssociation.__table__.join(GeneFamily.__table__)).
                                             where(GeneFamily.__table__.c.method_id == gene_family_method_id)
                                             ).fetchall()

        for sequence, family, _method in current_families:
            # Convert once, so the key that is looked up is always the key that is stored
            sequence_id, family_id = int(sequence), int(family)

            sequence_family[sequence_id] = family_id
            family_sequence.setdefault(family_id, []).append(sequence_id)

        # Create a dict (key = network) with the families present in that network
        # Families that occur multiple times should be present multiple times as this is used
        # to set thresholds later !

        for sequence, network_method in sequence_network_method.items():
            families_in_network = network_families.setdefault(network_method, [])

            # ignore sequences without a family, ideally this shouldn't happen
            if sequence in sequence_family:
                families_in_network.append(sequence_family[sequence])

        # Determine threshold and p-value
        # A background model will be computed for each combination of networks, an ECC score will need to be better
        # than 95 % of the randomly found values to be considered significant

        thresholds = {}
        print("Starting permutation tests")
        for n in network_method_ids:
            thresholds[n] = {}
            for m in network_method_ids:
                thresholds[n][m] = ExpressionNetworkMethod.__set_thresholds(network_families[n],
                                                                            network_families[m],
                                                                            max_size=max_size)

        # Data loaded start calculating ECCs
        ecc_table = SequenceSequenceECCAssociation.__table__
        new_ecc_scores = []

        for family, sequences in family_sequence.items():
            # Compare each pair of sequences within a family once
            for i in range(len(sequences) - 1):
                query = sequences[i]
                if query not in sequence_network:
                    continue

                for j in range(i + 1, len(sequences)):
                    target = sequences[j]
                    if target == query or target not in sequence_network:
                        continue

                    # Ignore genes with overlapping neighborhoods
                    if ExpressionNetworkMethod.__neighborhoods_overlap(sequence_network[query],
                                                                       sequence_network[target]):
                        continue

                    query_method = sequence_network_method[query]
                    target_method = sequence_network_method[target]

                    ecc, significant = ExpressionNetworkMethod.__ecc(sequence_network[query],
                                                                     sequence_network[target],
                                                                     sequence_family,
                                                                     thresholds[query_method][target_method],
                                                                     family,
                                                                     max_size=max_size)
                    if not significant:
                        continue

                    new_ecc_scores.append({
                        'query_id': query,
                        'target_id': target,
                        'ecc': ecc,
                        'gene_family_method_id': gene_family_method_id,
                        'query_network_method_id': query_method,
                        'target_network_method_id': target_method,
                    })

                    # add reciprocal relation
                    new_ecc_scores.append({
                        'query_id': target,
                        'target_id': query,
                        'ecc': ecc,
                        'gene_family_method_id': gene_family_method_id,
                        'query_network_method_id': target_method,
                        'target_network_method_id': query_method,
                    })

                    # Write in batches to keep the individual insert statements small
                    if len(new_ecc_scores) > 400:
                        db.engine.execute(ecc_table.insert(), new_ecc_scores)
                        new_ecc_scores = []

        # Add the last batch, an empty parameter list is not a meaningful bulk insert and is therefore skipped
        if new_ecc_scores:
            db.engine.execute(ecc_table.insert(), new_ecc_scores)

    @staticmethod
    def __neighborhoods_overlap(neighborhood_a, neighborhood_b):
        """
        Checks if two genes have overlapping networks

        Nodes without a 'gene_id' (missing or None) are ignored.

        :param neighborhood_a: neighborhood for first gene (string as stored in database)
        :param neighborhood_b: neighborhood for second gene (string as stored in database)
        :return: Bool, true if networks overlap
        """
        genes_a = {n['gene_id'] for n in json.loads(neighborhood_a) if n.get('gene_id') is not None}
        genes_b = {n['gene_id'] for n in json.loads(neighborhood_b) if n.get('gene_id') is not None}

        return not genes_a.isdisjoint(genes_b)

    @staticmethod
    def __ecc(q_network, t_network, families, thresholds, query_family, max_size=30):
        """
        Takes the networks neighborhoods (as stored in the databases), extracts the genes and find the families for
        each gene. Next the ECC score is calculated

        Genes without a family and genes from the query family itself are not taken into account.

        :param q_network: network for the query gene
        :param t_network: network for the target gene
        :param families: dictionary that links a sequence id (key) to a family id (value)
        :param thresholds: matrix (list of lists) as returned by __set_thresholds, indexed with the number of
                           distinct families (minus one) in the query and target neighborhood
        :param query_family: family id of the input genes, this family is excluded from the comparison
        :param max_size: number of distinct families above which the threshold for max_size is used
        :return: the ECC score for the two input neighborhoods given the families, a boolean flag if this is significant
        """
        q_genes = [n['gene_id'] for n in json.loads(q_network) if n.get('gene_id') is not None]
        t_genes = [n['gene_id'] for n in json.loads(t_network) if n.get('gene_id') is not None]

        q_families = [families[q] for q in q_genes if q in families and families[q] != query_family]
        t_families = [families[t] for t in t_genes if t in families and families[t] != query_family]

        # Without families on either side there is nothing to compare
        if not q_families or not t_families:
            return 0.0, False

        # jaccard is defined elsewhere in this module
        ecc = jaccard(q_families, t_families)

        q_size = min(len(set(q_families)), max_size)
        t_size = min(len(set(t_families)), max_size)

        threshold = thresholds[q_size - 1][t_size - 1]

        return ecc, ecc > threshold

    @staticmethod
    @benchmark
    def __set_thresholds(families_a, families_b, max_size=30, iterations=1000, step=5):
        """
        Empirically determine (permutation test) thresholds for ECC

        Only every step-th size is tested (1, 1 + step, ...), the value found is repeated step times so the matrix
        can be indexed directly with a size (minus one).

        :param families_a: families of species_a (list of internal family ids)
        :param families_b: families of species_b (list of internal family ids)
        :param max_size: maximum number of families (default = 30)
        :param iterations: number of permutations done
        :param step: step size
        :return: matrix (list of lists) with the thresholds at various family sizes
        """
        thresholds = []

        for i in range(0, max_size, step):
            print("%d done" % i)
            row = []
            for j in range(0, max_size, step):
                # This check doesn't change between permutations, so it is done once for each combination of sizes
                if i + 1 < len(families_a) and j + 1 < len(families_b):
                    scores = sorted(jaccard(random.sample(families_a, i + 1),
                                            random.sample(families_b, j + 1))
                                    for _ in range(iterations))

                    # TODO (maybe?): cutoff is hard coded here, replace ?
                    cutoff = scores[int(iterations * 0.95)]
                else:
                    # Cannot calculate threshold with these families, use 1
                    cutoff = 1

                row.extend([cutoff] * step)

            # Rows are only read afterwards, the same row can be shared for all sizes within this step
            thresholds.extend([row] * step)

        return thresholds
Exemple #14
0
class Sequence(db.Model):
    """
    A sequence (gene) of a species, with its coding sequence, its type and the relationships to expression
    profiles, networks, annotation and cross references.
    """
    __tablename__ = 'sequences'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    description = db.Column(db.Text)
    # Deferred column, only loaded when accessed or when explicitly undeferred (cf. export_cds)
    coding_sequence = db.deferred(db.Column(db.Text))
    type = db.Column(db.Enum('protein_coding',
                             'TE',
                             'RNA',
                             name='sequence_type'),
                     default='protein_coding')
    is_mitochondrial = db.Column(db.SmallInteger, default=False)
    is_chloroplast = db.Column(db.SmallInteger, default=False)

    expression_profiles = db.relationship('ExpressionProfile',
                                          backref=db.backref('sequence',
                                                             lazy='joined'),
                                          lazy='dynamic',
                                          cascade="all, delete-orphan",
                                          passive_deletes=True)
    network_nodes = db.relationship('ExpressionNetwork',
                                    backref=db.backref('sequence',
                                                       lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    # Other properties
    #
    # coexpression_cluster_associations declared in 'SequenceCoexpressionClusterAssociation'
    # interpro_associations declared in 'SequenceInterproAssociation'
    # go_associations declared in 'SequenceGOAssociation'
    # family_associations declared in 'SequenceFamilyAssociation'

    go_labels = db.relationship('GO', secondary=sequence_go, lazy='dynamic')
    interpro_domains = db.relationship('Interpro',
                                       secondary=sequence_interpro,
                                       lazy='dynamic')
    families = db.relationship('GeneFamily',
                               secondary=sequence_family,
                               lazy='dynamic')

    coexpression_clusters = db.relationship(
        'CoexpressionCluster',
        secondary=sequence_coexpression_cluster,
        backref=db.backref('sequences', lazy='dynamic'),
        lazy='dynamic')

    ecc_query_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.query_id == Sequence.id",
        backref=db.backref('query_sequence', lazy='joined'),
        lazy='dynamic')

    ecc_target_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.target_id == Sequence.id",
        backref=db.backref('target_sequence', lazy='joined'),
        lazy='dynamic')

    clade_associations_one = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin=
        "SequenceSequenceCladeAssociation.sequence_one_id == Sequence.id",
        backref=db.backref('sequence_one', lazy='joined'),
        lazy='dynamic')

    clade_associations_two = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin=
        "SequenceSequenceCladeAssociation.sequence_two_id == Sequence.id",
        backref=db.backref('sequence_two', lazy='joined'),
        lazy='dynamic')

    xrefs = db.relationship('XRef', secondary=sequence_xref, lazy='joined')

    def __init__(self,
                 species_id,
                 name,
                 coding_sequence,
                 type='protein_coding',
                 is_chloroplast=False,
                 is_mitochondrial=False,
                 description=None):
        """
        Creates a new sequence

        :param species_id: internal id of the species the sequence belongs to
        :param name: name of the sequence
        :param coding_sequence: the coding sequence
        :param type: type of sequence, 'protein_coding' (default), 'TE' or 'RNA'
        :param is_chloroplast: flag for sequences from the chloroplast (default = False)
        :param is_mitochondrial: flag for sequences from the mitochondrion (default = False)
        :param description: optional description
        """
        self.species_id = species_id
        self.name = name
        self.description = description
        self.coding_sequence = coding_sequence
        self.type = type
        self.is_chloroplast = is_chloroplast
        self.is_mitochondrial = is_mitochondrial

    @property
    def protein_sequence(self):
        """
        Function to translate the coding sequence to the amino acid sequence. The translation itself is done by
        translate (defined elsewhere), which according to the original note starts at the first start codon and
        breaks after adding a stop codon (indicated by '*')

        :return: The amino acid sequence based on the coding sequence
        """
        return translate(self.coding_sequence)

    @property
    def aliases(self):
        """
        Returns a readable string with the aliases or tokens stored for this sequence in the table xrefs

        :return: human readable string with aliases or None
        """
        tokens = [x.name for x in self.xrefs if x.platform == 'token']

        return ", ".join(tokens) if tokens else None

    @property
    def shortest_alias(self):
        """
        Returns the shortest alias

        :return: string with shortest alias or None (in case no aliases exist)
        """
        tokens = [x.name for x in self.xrefs if x.platform == 'token']

        return min(tokens, key=len) if tokens else None

    @property
    def display_name(self):
        """
        Returns a name to display (from xrefs with display) if available otherwise return name

        :return: display name
        """
        display_names = [x.name for x in self.xrefs if x.platform == 'display']

        return display_names[0] if display_names else self.name

    @property
    def best_name(self):
        """
        Checks if there is a display name, if not checks the shortest alias, otherwise returns name. To be used in e.g.
        graphs

        :return: string with best name to show in graphs, ...
        """
        # Check the xrefs directly. Comparing display_name with name using 'is not' depends on the identity of two
        # string objects, which is not a reliable way to find out if a display name exists
        display_names = [x.name for x in self.xrefs if x.platform == 'display']
        if display_names:
            return display_names[0]

        alias = self.shortest_alias

        return alias if alias is not None else self.name

    @property
    def readable_type(self):
        """
        Converts the type table to a readable string

        :return: string with readable version of the sequence type
        """
        conversion = {
            'protein_coding': 'protein coding',
            'TE': 'transposable element',
            'RNA': 'RNA'
        }

        return conversion.get(self.type, 'other')

    @staticmethod
    def add_from_fasta(filename, species_id, compressed=False):
        """
        Reads a fasta file and adds all sequences, sorted by name, as protein coding sequences to the database

        :param filename: path to the fasta file
        :param species_id: internal id of the species the sequences belong to
        :param compressed: passed on to the fasta reader (default = False)
        :return: number of sequences in the fasta file
        """
        fasta_data = Fasta()
        fasta_data.readfile(filename, compressed=compressed)

        new_sequences = []

        # Loop over sequences, sorted by name (key here) and add to db
        for name, sequence in sorted(fasta_data.sequences.items(),
                                     key=operator.itemgetter(0)):
            new_sequence = {
                "species_id": species_id,
                "name": name,
                "description": None,
                "coding_sequence": sequence,
                "type": "protein_coding",
                "is_mitochondrial": False,
                "is_chloroplast": False
            }

            new_sequences.append(new_sequence)

            # add 400 sequences at the time, more can cause problems with some database engines
            if len(new_sequences) > 400:
                db.engine.execute(Sequence.__table__.insert(), new_sequences)
                new_sequences = []

        # add the last set of sequences, an empty parameter list is not a meaningful bulk insert and is skipped
        if new_sequences:
            db.engine.execute(Sequence.__table__.insert(), new_sequences)

        return len(fasta_data.sequences.keys())

    @staticmethod
    def add_descriptions(filename, species_id):
        """
        Adds descriptions from a tab-delimited file (name, description) to the sequences of one species. Lines that
        cannot be split in exactly two fields are reported on stderr and skipped, names that do not match a
        sequence of the species are ignored.

        :param filename: path to the tab-delimited file
        :param species_id: internal id of the species
        """
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        seq_dict = {s.name: s for s in sequences}

        with open(filename, "r") as f_in:
            for i, line in enumerate(f_in):
                try:
                    name, description = line.strip().split('\t')
                except ValueError:
                    print("Cannot parse line %d: \"%s\"" % (i, line),
                          file=sys.stderr)
                else:
                    # Only reached when the line was parsed. With 'finally' a bad line would either fail with a
                    # NameError (first line) or apply the values of the previous line again
                    if name in seq_dict:
                        seq_dict[name].description = description

                # commit at regular intervals
                if i % 400 == 0:
                    db.session.commit()

            db.session.commit()

    @staticmethod
    def export_cds(filename):
        """
        Writes the coding sequence of all sequences to a file in fasta format

        :param filename: path to the output file
        """
        # coding_sequence is a deferred column, load it along with the sequences
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.coding_sequence), file=f_out)

    @staticmethod
    def export_protein(filename):
        """
        Writes the protein sequence (translated coding sequence) of all sequences to a file in fasta format

        :param filename: path to the output file
        """
        # coding_sequence is a deferred column, load it along with the sequences
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.protein_sequence), file=f_out)
Exemple #15
0
class SequenceGOAssociation(db.Model):
    """
    Association between a sequence and a GO term, with the evidence code, the source and (for predicted
    associations) the serialized data of the prediction.
    """
    __tablename__ = 'sequence_go'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    evidence = db.Column(
        db.Enum('EXP',
                'IDA',
                'IPI',
                'IMP',
                'IGI',
                'IEP',
                'ISS',
                'ISO',
                'ISA',
                'ISM',
                'IGC',
                'IBA',
                'IBD',
                'IKR',
                'IRD',
                'RCA',
                'TAS',
                'NAS',
                'IC',
                'ND',
                'IEA',
                name='evidence'))
    source = db.Column(db.Text)

    predicted = db.Column(db.SmallInteger, default=False)
    # JSON string, de-serialized by the property data
    prediction_data = db.Column(db.Text)

    sequence = db.relationship('Sequence',
                               backref=db.backref('go_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    go = db.relationship('GO',
                         backref=db.backref('sequence_associations',
                                            lazy='dynamic',
                                            passive_deletes=True),
                         lazy='joined')

    def __init__(self,
                 sequence_id,
                 go_id,
                 evidence,
                 source,
                 predicted=False,
                 prediction_data=None):
        """
        Creates a new association between a sequence and a GO term

        :param sequence_id: internal id of the sequence
        :param go_id: internal id of the GO term
        :param evidence: evidence code (one of the values of the evidence enum)
        :param source: source of the association
        :param predicted: flag for predicted associations (default = False)
        :param prediction_data: json string with details on the prediction (default = None)
        """
        self.sequence_id = sequence_id
        self.go_id = go_id
        self.evidence = evidence
        self.source = source
        self.predicted = predicted
        self.prediction_data = prediction_data

    @property
    def data(self):
        """
        Property to get the information in the prediction_data as a dict. Useful for showing these values in e.g. jinja2
        templates

        :return: de-serialized prediction_data (json), None if there is no prediction_data
        """
        # prediction_data defaults to None, json.loads would fail with a TypeError in that case
        if self.prediction_data is None:
            return None

        return json.loads(self.prediction_data)