Exemple #1
0
class SequenceXRefAssociation(db.Model):
    """Link table connecting sequences to external references (xrefs)."""
    __tablename__ = 'sequence_xref'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    # Both foreign keys are indexed; ON DELETE CASCADE removes association
    # rows together with the referenced sequence/xref.
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'),
                            index=True)
    xref_id = db.Column(db.Integer,
                        db.ForeignKey('xrefs.id', ondelete='CASCADE'),
                        index=True)
Exemple #2
0
class FamilyXRefAssociation(db.Model):
    """Link table connecting gene families to external references (xrefs)."""
    __tablename__ = 'family_xref'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    # ON DELETE CASCADE on both ends keeps this table consistent when a
    # family or xref is removed.
    gene_family_id = db.Column(db.Integer,
                               db.ForeignKey('gene_families.id',
                                             ondelete='CASCADE'))
    xref_id = db.Column(db.Integer,
                        db.ForeignKey('xrefs.id', ondelete='CASCADE'))
Exemple #3
0
class News(db.Model):
    """A news item: a markdown message plus posting metadata."""
    __tablename__ = 'news'
    id = db.Column(db.Integer, primary_key=True)
    message = db.Column(db.Text(collation=SQL_COLLATION))  # markdown source
    posted = db.Column(db.DateTime)
    posted_by = db.Column(db.String(100))

    @property
    def message_markup(self):
        """Render the markdown message into HTML-safe markup."""
        html = markdown(self.message)
        return Markup(html)

    @property
    def posted_formatted(self):
        """Posting timestamp formatted as 'YYYY-MM-DD HH:MM'."""
        return self.posted.strftime("%Y-%m-%d %H:%M")
Exemple #4
0
class ClusterGOEnrichment(db.Model):
    """GO-term enrichment result for one co-expression cluster / GO pair."""
    __tablename__ = 'cluster_go_enrichment'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    cluster_id = db.Column(db.Integer,
                           db.ForeignKey('coexpression_clusters.id',
                                         ondelete='CASCADE'))
    go_id = db.Column(db.Integer,
                      db.ForeignKey('go.id', ondelete='CASCADE'))

    # Eager-load both ends; backrefs expose dynamic queries and rely on
    # DB-level cascades (passive_deletes).
    cluster = db.relationship(
        'CoexpressionCluster',
        backref=db.backref('go_enrichment',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')

    go = db.relationship(
        'GO',
        backref=db.backref('enriched_clusters',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')
    """
    Counts required to calculate the enrichment,
    store here for quick access
    """
    cluster_count = db.Column(db.Integer)
    cluster_size = db.Column(db.Integer)
    go_count = db.Column(db.Integer)
    go_size = db.Column(db.Integer)
    """
    Enrichment score (log-transformed), p-value and corrected p-value. Calculated using the hypergeometric
    distribution and applying FDR correction (aka. BH)
    """
    enrichment = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    @property
    def cluster_percentage(self):
        """cluster_count expressed as a percentage of cluster_size."""
        return 100 * self.cluster_count / self.cluster_size

    @property
    def genome_percentage(self):
        """go_count expressed as a percentage of go_size."""
        return 100 * self.go_count / self.go_size
Exemple #5
0
class FamilyGOAssociation(db.Model):
    """Link table between gene families and their GO annotations."""
    __tablename__ = 'family_go'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    gene_family_id = db.Column(db.Integer,
                               db.ForeignKey('gene_families.id',
                                             ondelete='CASCADE'))
    go_id = db.Column(db.Integer,
                      db.ForeignKey('go.id', ondelete='CASCADE'))

    # Eager-loaded relationships; the backrefs give dynamic queries on the
    # other side and leave deletes to the database (passive_deletes).
    gene_family = db.relationship(
        'GeneFamily',
        backref=db.backref('go_annotations',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')

    go_term = db.relationship(
        'GO',
        backref=db.backref('family_associations',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')
Exemple #6
0
class SequenceFamilyAssociation(db.Model):
    """Link table assigning sequences to gene families."""
    __tablename__ = 'sequence_family'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id',
                                          ondelete='CASCADE'))
    gene_family_id = db.Column(db.Integer,
                               db.ForeignKey('gene_families.id',
                                             ondelete='CASCADE'))

    # Both sides are joined-loaded; backrefs are dynamic queries with
    # DB-level delete handling.
    sequence = db.relationship(
        'Sequence',
        backref=db.backref('family_associations',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')
    family = db.relationship(
        'GeneFamily',
        backref=db.backref('sequence_associations',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')
Exemple #7
0
class SequenceInterproAssociation(db.Model):
    """Link table between sequences and InterPro domains, with the
    location of the domain on the sequence."""
    __tablename__ = 'sequence_interpro'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id',
                                          ondelete='CASCADE'))
    interpro_id = db.Column(db.Integer,
                            db.ForeignKey('interpro.id',
                                          ondelete='CASCADE'))
    # Domain coordinates on the sequence (None when unknown)
    start = db.Column(db.Integer, default=None)
    stop = db.Column(db.Integer, default=None)

    sequence = db.relationship(
        'Sequence',
        backref=db.backref('interpro_associations',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')

    domain = db.relationship(
        'Interpro',
        backref=db.backref('sequence_associations',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')
Exemple #8
0
class SequenceCoexpressionClusterAssociation(db.Model):
    """Link table assigning sequences (via their probe) to co-expression
    clusters."""
    __tablename__ = 'sequence_coexpression_cluster'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    probe = db.Column(db.String(50), index=True)  # platform probe name
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id',
                                          ondelete='CASCADE'))
    coexpression_cluster_id = db.Column(db.Integer,
                                        db.ForeignKey(
                                            'coexpression_clusters.id',
                                            ondelete='CASCADE'))

    sequence = db.relationship(
        'Sequence',
        backref=db.backref('coexpression_cluster_associations',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')
    coexpression_cluster = db.relationship(
        'CoexpressionCluster',
        backref=db.backref('sequence_associations',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined')
Exemple #9
0
class CoexpressionClusterSimilarity(db.Model):
    """Similarity (Jaccard index + significance) between two
    co-expression clusters, per gene-family method."""
    __tablename__ = 'coexpression_cluster_similarity'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    source_id = db.Column(db.Integer,
                          db.ForeignKey('coexpression_clusters.id',
                                        ondelete='CASCADE'))
    target_id = db.Column(db.Integer,
                          db.ForeignKey('coexpression_clusters.id',
                                        ondelete='CASCADE'))

    gene_family_method_id = db.Column('gene_family_method_id',
                                      db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    # Similarity statistics, indexed for sorting/filtering
    jaccard_index = db.Column(db.Float, index=True)
    p_value = db.Column(db.Float, index=True)
    corrected_p_value = db.Column(db.Float, index=True)

    # Two relationships target the same table, so foreign_keys is needed
    # to tell SQLAlchemy which FK column backs which side.
    source = db.relationship(
        'CoexpressionCluster',
        backref=db.backref('similarity_sources',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined',
        foreign_keys=[source_id])

    target = db.relationship(
        'CoexpressionCluster',
        backref=db.backref('similarity_targets',
                           lazy='dynamic',
                           passive_deletes=True),
        lazy='joined',
        foreign_keys=[target_id])

    gene_family_method = db.relationship(
        'GeneFamilyMethod',
        backref=db.backref('CoexpressionClusterSimilarities',
                           passive_deletes=True),
        lazy='joined')

    @staticmethod
    def empty_table():
        """
        Delete all content from this table. Use carefully !
        """
        CoexpressionClusterSimilarity.query.delete()
Exemple #10
0
class ExpressionSpecificity(db.Model):
    """Expression-specificity metrics of one profile for one condition."""
    __tablename__ = 'expression_specificity'

    id = db.Column(db.Integer, primary_key=True)
    profile_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_profiles.id', ondelete='CASCADE'),
        index=True)
    condition = db.Column(db.String(255), index=True)
    # Specificity metrics; all indexed so they can be filtered/sorted on
    score = db.Column(db.Float, index=True)
    entropy = db.Column(db.Float, index=True)
    tau = db.Column(db.Float, index=True)
    method_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_specificity_method.id', ondelete='CASCADE'),
        index=True)
Exemple #11
0
class SequenceSequenceCladeAssociation(db.Model):
    """Pair of sequences associated with the clade of an ancestral tree
    node, flagged as duplication or speciation."""
    __tablename__ = 'sequence_sequence_clade'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    sequence_one_id = db.Column(db.Integer,
                                db.ForeignKey('sequences.id',
                                              ondelete='CASCADE'))
    sequence_two_id = db.Column(db.Integer,
                                db.ForeignKey('sequences.id',
                                              ondelete='CASCADE'))

    clade_id = db.Column(db.Integer,
                         db.ForeignKey('clades.id', ondelete='CASCADE'),
                         index=True)
    tree_id = db.Column(db.Integer,
                        db.ForeignKey('trees.id', ondelete='CASCADE'),
                        index=True)

    # 1 for a duplication node, 0 for a speciation node
    duplication = db.Column(db.SmallInteger)
    duplication_consistency_score = db.Column(db.Float)

    tree = db.relationship(
        'Tree',
        lazy='joined',
        backref=db.backref('sequence_sequence_clade_associations',
                           lazy='dynamic',
                           passive_deletes=True))

    clade = db.relationship(
        'Clade',
        lazy='joined',
        backref=db.backref('sequence_sequence_clade_associations',
                           lazy='dynamic',
                           passive_deletes=True))

    def __str__(self):
        return "%d" % self.id

    @property
    def readable_type(self):
        """
        Returns type (duplication or speciation) in a human-readable format

        :return: string Duplication or Speciation
        """
        if self.duplication:
            return "Duplication"
        return "Speciation"

    @property
    def readable_score(self):
        """
        Returns the duplication consistency score in a nicer format

        :return: string with the score in %.3f format, or "Not available"
            for speciations
        """
        if not self.duplication:
            return "Not available"
        return "%.3f" % self.duplication_consistency_score
Exemple #12
0
class ConditionTissue(db.Model):
    """Per-species conversion table mapping experimental conditions to a
    broader feature (e.g. tissue), plus plot order and colors, stored as
    JSON in the data column."""
    __tablename__ = 'conditions_tissue'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'))
    # JSON blob: {'order': [...], 'colors': [...], 'conversion': {...}}
    data = db.Column(db.Text)
    description = db.Column(db.Text)

    expression_specificity_method_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_specificity_method.id', ondelete='CASCADE'),
        index=True)

    in_tree = db.Column(db.SmallInteger, default=0)

    @staticmethod
    def add(species_id,
            data,
            order,
            colors,
            expression_specificity_method_id,
            description=''):
        """
        Add conversion table to the database for a species

        :param species_id: internal id for the species
        :param data: dict with the conversion (key = condition, value = more general feature (e.g. tissue))
        :param order: list with order of the samples in the plot
        :param colors: list with colors to use in the plot
        :param expression_specificity_method_id: ID for expression specificity method
        :param description: optional free-text description (default '')
        """
        new_entry = ConditionTissue()

        new_entry.species_id = species_id
        new_entry.data = json.dumps({'order': order,
                                     'colors': colors,
                                     'conversion': data})
        new_entry.expression_specificity_method_id = \
            expression_specificity_method_id
        new_entry.description = description

        db.session.add(new_entry)
        db.session.commit()
Exemple #13
0
class Tree(db.Model):
    """Phylogenetic tree for a gene family, stored as newick text.

    data_newick holds the raw tree; data_phyloxml holds a reconciled
    newick variant (internal node names encoded as
    "<clade_id>_<D|S>_<score>" by TreeMethod.reconcile_trees) that the
    phyloxml property converts to a phyloXML document.
    """
    __tablename__ = 'trees'
    id = db.Column(db.Integer, primary_key=True)

    label = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    data_newick = db.Column(db.Text)
    data_phyloxml = db.Column(db.Text)

    gf_id = db.Column(db.Integer,
                      db.ForeignKey('gene_families.id', ondelete='CASCADE'),
                      index=True)
    method_id = db.Column(db.Integer,
                          db.ForeignKey('tree_methods.id', ondelete='CASCADE'),
                          index=True)

    @property
    def ascii_art(self):
        """
        Returns an ascii representation of the tree. Useful for quick visualizations

        :return: string with ascii representation of the tree
        """
        tree = newick.loads(self.data_newick)[0]

        return tree.ascii_art()

    @staticmethod
    def __yattag_node(node,
                      tag,
                      text,
                      line,
                      id_to_clade,
                      seq_to_species,
                      seq_to_id,
                      root=1):
        """
        Recursively emit one tree node (and its children) as a phyloXML
        <clade> element using yattag's tag/line helpers.

        :param node: newick node to emit
        :param tag: yattag tag context manager
        :param text: yattag text function (unused here, passed through)
        :param line: yattag line function
        :param id_to_clade: dict mapping internal clade ids to clade names
        :param seq_to_species: dict mapping sequence names to species codes
        :param seq_to_id: dict mapping sequence names to internal sequence ids
        :param root: 1 for the root call (gets a fixed branch length), 0 for
            recursive calls
        """
        with tag('clade'):
            if root == 1:
                # Root has no meaningful branch length; use a fixed stub value
                line('branch_length', 0.1)
            else:
                line('branch_length', node.length)
            if node.is_leaf:
                # Leaf: emit sequence name plus id/taxonomy when known
                line('name', node.name)
                if node.name in seq_to_id.keys():
                    line('id', seq_to_id[node.name])
                if node.name in seq_to_species.keys():
                    with tag('taxonomy'):
                        line('code', seq_to_species[node.name])
            else:
                # Internal node names are "<clade_id>_<D|S>_<score>", written
                # by TreeMethod.reconcile_trees
                clade_id, duplication, dup_score = node.name.split('_')

                clade_id = int(clade_id)
                duplication = True if duplication == 'D' else False
                dup_score = float(dup_score)

                if clade_id in id_to_clade.keys():
                    with tag('taxonomy'):
                        line('code', id_to_clade[clade_id])

                if duplication:
                    line('property',
                         str(dup_score),
                         applies_to="clade",
                         datatype="xksd:double",
                         ref="Duplication consistency score")
                    with tag('events'):
                        line('duplications', 1)
                else:
                    with tag('events'):
                        line('speciations', 1)

                # Recurse into children (only internal nodes have descendants)
                for d in node.descendants:
                    Tree.__yattag_node(d,
                                       tag,
                                       text,
                                       line,
                                       id_to_clade,
                                       seq_to_species,
                                       seq_to_id,
                                       root=0)

    @property
    def phyloxml(self):
        """
        Convert the reconciled newick stored in data_phyloxml into a
        phyloXML document, decorated with expression graphs (heatmap,
        low-expression flags, max expression) and species/clade taxonomy
        entries.

        :return: indented phyloXML string
        """
        # Load Tree with addition information
        tree = newick.loads(self.data_phyloxml)[0]

        # Load Additional information from the database
        clades = Clade.query.all()
        id_to_clade = {c.id: c.name for c in clades}
        seq_to_species = {}
        seq_to_id = {}
        species = []

        # Map leaf sequence names to ids/species and collect distinct species
        for s in self.sequences.all():
            seq_to_id[s.name] = s.id
            seq_to_species[s.name] = s.species.code
            if s.species not in species:
                species.append(s.species)

        # Cross-species expression data for all sequences in the tree
        csep = CrossSpeciesExpressionProfile()
        csep_data = csep.get_data(*seq_to_id.values())

        # A heatmap is only emitted when at least one profile defines an order
        has_heatmap = False
        heatmap_order = []
        for cd in csep_data:
            if "profile" in cd.keys() and "order" in cd["profile"].keys():
                has_heatmap = True
                heatmap_order = cd["profile"]["order"]
                break

        # Start constructing PhyloXML
        doc, tag, text, line = Doc().ttl()
        with tag('phyloxml'):
            with tag('phylogeny', rooted="True"):
                # line('name', self.label)
                # line('description', "PlaNet 2.0 PhyloXML tree")
                Tree.__yattag_node(tree, tag, text, line, id_to_clade,
                                   seq_to_species, seq_to_id)

            with tag('graphs'):
                if has_heatmap:
                    with tag('graph', type="heatmap"):
                        line('name', 'Heatmap')
                        with tag('legend', show=1):
                            for label in heatmap_order:
                                with tag('field'):
                                    line('name', label)
                            with tag('gradient'):
                                line('name', 'YlGnBu')
                                line('classes', len(heatmap_order))
                        with tag('data'):
                            # One <values> row per sequence that has profile data
                            for cd in csep_data:
                                if "profile" in cd.keys(
                                ) and "data" in cd["profile"].keys():
                                    with tag('values',
                                             **{'for':
                                                str(cd["sequence_id"])}):
                                        for label in heatmap_order:
                                            if cd["profile"]["data"][
                                                    label] is not None:
                                                line(
                                                    'value', cd["profile"]
                                                    ["data"][label])
                                            else:
                                                line('value', '')

                with tag('graph', type="binary"):
                    line('name', 'Low Expression')
                    with tag('legend', show=1):
                        with tag('field'):
                            line('name', 'Low expression')
                            line('color', '0xf03b20')
                            line('shape', 'circle')

                    with tag('data'):
                        for cd in csep_data:
                            if "low_expressed" in cd.keys():
                                with tag('values',
                                         **{'for': str(cd["sequence_id"])}):
                                    line('value', cd["low_expressed"])

                with tag('graph', type="multibar"):
                    line('name', 'Expression Range')
                    with tag('legend', show=1):
                        with tag('field'):
                            line('name', 'Max. Expression (TPM)')
                            line('color', '0x664977')

                    with tag('data'):
                        for cd in csep_data:
                            if "max_expression" in cd.keys():
                                with tag('values',
                                         **{'for': str(cd["sequence_id"])}):
                                    line('value', cd["max_expression"])

            # Taxonomy blocks: colors and links for species and clades
            with tag('taxonomies'):
                for s in species:
                    with tag('taxonomy', code=s.code):
                        line('color', s.color.replace("#", "0x"))
                        line('name', s.name)
                        line(
                            'url',
                            url_for('species.species_view',
                                    species_id=s.id,
                                    _external=True))

                for c in clades:
                    with tag('taxonomy', code=c.name):
                        line('color', '0x000000')
                        line('name', c.name)
                        line(
                            'url',
                            url_for('clade.clade_view',
                                    clade_id=c.id,
                                    _external=True))

        return indent(doc.getvalue())

    @property
    def count(self):
        """Number of leaves in the tree."""
        tree = newick.loads(self.data_newick)[0]
        return len(tree.get_leaves())

    @property
    def sequences(self):
        """Query for the Sequence records matching the tree's leaf names."""
        tree = newick.loads(self.data_newick)[0]
        sequences = [l.name for l in tree.get_leaves()]

        return Sequence.query.filter(Sequence.name.in_(sequences))

    @property
    def tree_stripped(self):
        """Newick string of the tree with all branch lengths removed."""
        tree = newick.loads(self.data_newick)[0]
        tree.remove_lengths()

        return newick.dumps([tree])
Exemple #14
0
class TreeMethod(db.Model):
    """A method used to build trees; groups Tree records and carries the
    reconciliation logic for them."""
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)

    description = db.Column(db.Text)

    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Reconcile every tree of this method against the known clades.

        For each binary internal node: determine the clade covering the
        species on both branches, classify the node as duplication or
        speciation, and (for duplications) compute the duplication
        consistency score. Pairwise results are bulk-inserted into the
        sequence_sequence_clade table, the annotated newick (node names
        "<clade_id>_<D|S>_<score>") is stored in each tree's
        data_phyloxml, and the session is committed.

        NOTE(review): contains in-progress debug prints left by a previous
        author; output to stdout is part of current behavior.
        """
        print("\n1.====================Getting into function reconcile_trees")
        # Fetch required data from the database
        sequences = Sequence.query.all()
        #print("\n1.1.=============================Sequences Joined: " + ', '.join(sequences)) #FAILS, bad print statement for list obj
        clades = Clade.query.all()
        #print("\n1.2. =========================Clades: ", *clades, sep='\n') # print works

        # Lookup tables: sequence name -> species code / id,
        # clade name -> member species list / id
        seq_to_species = {s.name: s.species.code for s in sequences}
        #print("\n2.=========================seq_to_species: ", *seq_to_species, sep='::')
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []

        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]
            print("\n3.=========================tree loaded ok")

            for node in tree.walk():
                if len(node.descendants) != 2:
                    #print("\n4.==========length of node descendant=" + str(len(node.descendants)))
                    if not node.is_binary:
                        print("\n5.================Non-Binary-node: " +
                              str(node.is_binary))
                        # Print warning in case there is a non-binary node
                        #sdash: commenting out this original print statement because none binary-node doesn't have id nor label. Process stops at this print statement for non-binary trees.

                        print(
                            "Non-Binary tree: " + t.data_newick
                        )  #sdash: this print statement will show which tree is non-binary and is skipped. Doesn't stop the reconcile process.
                        #sdash May-03-2019#original#
                        #print("[%d, %s] Skipping node... Can only reconcile binary nodes ..." % (tree.id, tree.label))
                    # Otherwise it is a leaf node and can be skipped
                    continue

                # Leaf names under each of the two branches of this node
                branch_one_seq = [
                    l.name.strip() for l in node.descendants[0].get_leaves()
                ]
                # print("\n6.===============Branch-one-seq: " + ', '.join(branch_one_seq))
                branch_two_seq = [
                    l.name.strip() for l in node.descendants[1].get_leaves()
                ]
                # print("\n7.===============Branch-two-seq: " + ', '.join(branch_two_seq))

                # Species codes represented on each branch
                branch_one_species = set([
                    seq_to_species[s] for s in branch_one_seq
                    if s in seq_to_species.keys()
                ])
                print(
                    "\n8.===============Branch-one-spp: " +
                    ', '.join(branch_one_species)
                )  #Empty set, length=0; seq_to_species length=143271; SO, problem in forming this set definition
                ## TO DO:
                #Possibly the seq name seq_to_species doesn't match in branch_one_seq and
                #  hence, it is an empty set.  Next check this possibility. Tue June 25.

                branch_two_species = set([
                    seq_to_species[s] for s in branch_two_seq
                    if s in seq_to_species.keys()
                ])
                print("\n9.===============Branch-two-spp: " +
                      ', '.join(branch_two_species))

                all_species = branch_one_species.union(branch_two_species)

                # Clade covering all species plus duplication/speciation call
                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species,
                                                   branch_two_species,
                                                   clade_to_species)

                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(
                        branch_one_species, branch_two_species)

                # Encode the result into the node name; Tree.__yattag_node
                # parses this "<clade_id>_<D|S>_<score>" format later
                tags = [
                    clade_to_id[clade] if clade is not None else 0,
                    'D' if duplication else 'S',
                    duplication_consistency if duplication else 0
                ]

                node.name = '_'.join([str(t) for t in tags])

                # Record the association for every cross-branch sequence
                # pair, in both (one, two) and (two, one) orientations
                if clade is not None:
                    for seq_one in branch_one_seq:
                        for seq_two in branch_two_seq:
                            new_associations.append({
                                'sequence_one_id':
                                seq_to_id[seq_one],
                                'sequence_two_id':
                                seq_to_id[seq_two],
                                'tree_id':
                                t.id,
                                'clade_id':
                                clade_to_id[clade],
                                'duplication':
                                1 if duplication else 0,
                                'duplication_consistency_score':
                                duplication_consistency
                            })
                            new_associations.append({
                                'sequence_one_id':
                                seq_to_id[seq_two],
                                'sequence_two_id':
                                seq_to_id[seq_one],
                                'tree_id':
                                t.id,
                                'clade_id':
                                clade_to_id[clade],
                                'duplication':
                                1 if duplication else 0,
                                'duplication_consistency_score':
                                duplication_consistency
                            })

            # Flush accumulated rows in batches to keep memory bounded
            if len(new_associations) > 400:
                db.engine.execute(
                    SequenceSequenceCladeAssociation.__table__.insert(),
                    new_associations)
                new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # Insert any remaining rows from the last batch
        db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(),
                          new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data.keys():
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
Exemple #15
0
class GO(db.Model):
    """
    Gene Ontology term. Stores the term itself (label, name, namespace, ...)
    plus denormalized helper fields: the extended (ancestor) GO set and a
    per-species count cache used by enrichment predictions.
    """
    __tablename__ = 'go'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    name = db.Column(db.Text)
    type = db.Column(db.Enum('biological_process', 'molecular_function', 'cellular_component', name='go_type'))
    description = db.Column(db.Text)
    obsolete = db.Column(db.SmallInteger)
    # semicolon-separated parent labels (direct is_a parents)
    is_a = db.Column(db.Text)
    # semicolon-separated labels of all ancestor terms (the "extended" GO set)
    extended_go = db.Column(db.Text)
    # JSON dict {species_id: sequence_count}, filled by update_species_counts()
    species_counts = db.Column(db.Text)

    sequences = db.relationship('Sequence', secondary=sequence_go, lazy='dynamic')

    # Other properties
    #
    # sequence_associations declared in 'SequenceGOAssociation'
    # enriched_clusters declared in 'ClusterGOEnrichment'

    def __init__(self, label, name, go_type, description, obsolete, is_a, extended_go):
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.obsolete = obsolete
        self.is_a = is_a
        self.extended_go = extended_go
        self.species_counts = ""

    def set_all(self, label, name, go_type, description, extended_go):
        """Update the main fields of an existing term (resets the species count cache)."""
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.extended_go = extended_go
        self.species_counts = ""

    @property
    def short_type(self):
        """Two-letter abbreviation of the GO namespace (BP/MF/CC, UNK if unknown)."""
        if self.type == 'biological_process':
            return 'BP'
        elif self.type == 'molecular_function':
            return 'MF'
        elif self.type == 'cellular_component':
            return 'CC'
        else:
            return 'UNK'

    @property
    def readable_type(self):
        """Human-readable version of the GO namespace."""
        if self.type == 'biological_process':
            return 'Biological process'
        elif self.type == 'molecular_function':
            return 'Molecular function'
        elif self.type == 'cellular_component':
            return 'Cellular component'
        else:
            return 'Unknown type'

    @property
    def parent_count(self):
        """
        Returns the total number of GO terms 'above' this term in the DAG
        (size of the extended/ancestor set).

        :return: number of ancestor terms
        """
        return len(self.extended_go.split(';')) if self.extended_go != '' else 0

    @property
    def interpro_stats(self):
        from conekt.models.interpro import Interpro

        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    def species_occurrence(self, species_id):
        """
        count how many genes have the current GO term in a given species

        :param species_id: internal id of the selected species
        :return: count of sequences with this term associated
        """
        count = 0
        sequences = self.sequences.all()

        for s in sequences:
            if s.species_id == species_id:
                count += 1

        return count

    @staticmethod
    def sequence_stats(sequence_ids, exclude_predicted=True):
        """
        Takes a list of sequence IDs and returns GO stats for those sequences

        :param sequence_ids: list of sequence ids
        :param exclude_predicted: if True (default) predicted GO labels will be excluded
        :return: dict with stats for each GO term linked with any of the input sequences
        """
        query = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids))

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences, exclude_predicted=True):
        """Same as sequence_stats, but takes a sequence query instead of a list of ids."""
        subquery = sequences.subquery()

        query = SequenceGOAssociation.query

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.join(subquery, SequenceGOAssociation.sequence_id == subquery.c.id).all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        """Fold a list of sequence-GO associations into per-term count/sequence/species stats."""
        output = {}
        for d in associations:
            if d.go_id not in output.keys():
                output[d.go_id] = {
                    'go': d.go,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.go_id]['count'] += 1
                if d.sequence_id not in output[d.go_id]['sequences']:
                    output[d.go_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.go_id]['species']:
                    output[d.go_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @staticmethod
    def update_species_counts():
        """
        Adds a phylo-profile to each GO label; results are stored in the database
        (species_counts column, as a JSON dict {species_id: count}). Only
        non-predicted associations are counted.
        """
        # link species to sequences
        sequences = db.engine.execute(db.select([Sequence.__table__.c.id, Sequence.__table__.c.species_id])).fetchall()

        sequence_to_species = {}
        for seq_id, species_id in sequences:
            if species_id is not None:
                sequence_to_species[seq_id] = int(species_id)

        # get go for all genes
        associations = db.engine.execute(
            db.select([SequenceGOAssociation.__table__.c.sequence_id,
                       SequenceGOAssociation.__table__.c.go_id], distinct=True)\
            .where(SequenceGOAssociation.__table__.c.predicted == 0))\
            .fetchall()

        count = {}
        for seq_id, go_id in associations:
            # sequences without a species were excluded from the map above;
            # skip them instead of raising a KeyError
            if seq_id not in sequence_to_species:
                continue
            species_id = sequence_to_species[seq_id]

            if go_id not in count.keys():
                count[go_id] = {}

            if species_id not in count[go_id]:
                count[go_id][species_id] = 1
            else:
                count[go_id][species_id] += 1

        # update counts
        for go_id, data in count.items():
            db.engine.execute(db.update(GO.__table__)
                              .where(GO.__table__.c.id == go_id)
                              .values(species_counts=json.dumps(data)))

    @staticmethod
    def add_from_obo(filename, empty=True, compressed=False):
        """
        Parses GeneOntology's OBO file and adds it to the database

        :param filename: Path to the OBO file to parse
        :param compressed: load data from .gz file if true (default: False)
        :param empty: Empty the database first when true (default: True)
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(GO).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        obo_parser = OBOParser()
        obo_parser.readfile(filename, compressed=compressed)

        obo_parser.extend_go()

        for i, term in enumerate(obo_parser.terms):
            go = GO(term.id, term.name, term.namespace, term.definition, term.is_obsolete, ";".join(term.is_a),
                    ";".join(term.extended_go))

            db.session.add(go)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_go_from_plaza(filename):
        """
        Adds GO annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        go_parser = GOParser()

        go_parser.read_plaza_go(filename)

        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for term in terms:
                    if term["id"] in go_hash.keys():
                        current_term = go_hash[term["id"]]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": term["evidence"],
                            "source": term["source"]}
                        associations.append(association)
                    else:
                        print(term, "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            # insert in chunks of 400; larger batches cause problems with some engines
            if len(associations) > 400:
                db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                associations = []

        # Add extended GOs
        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []

                for term in terms:
                    if term["id"] not in current_terms:
                        current_terms.append(term["id"])

                for term in terms:
                    if term["id"] in go_hash.keys():
                        extended_terms = go_hash[term["id"]].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms and extended_term not in new_terms:
                                new_terms.append(extended_term)

                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                        associations = []

        # flush the remaining associations (skip the insert when nothing is left)
        if associations:
            db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def add_go_from_tab(filename, species_id, source="Source not provided"):
        """
        Adds GO annotation from a tab-delimited file (gene<TAB>term<TAB>evidence)
        for a single species, including the extended (ancestor) terms.

        :param filename: path to the annotation file
        :param species_id: internal id of the species the genes belong to
        :param source: value for the source field of the new associations
        """
        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.filter_by(species_id=species_id).all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        gene_go = defaultdict(list)

        with open(filename, "r") as f:
            for line in f:
                gene, term, evidence = line.strip().split('\t')
                if gene in gene_hash.keys():
                    current_sequence = gene_hash[gene]
                    if term in go_hash.keys():
                        current_term = go_hash[term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": evidence,
                            "source": source}
                        associations.append(association)

                        if term not in gene_go[gene]:
                            gene_go[gene].append(term)

                    else:
                        print(term, "not found in the database.")
                else:
                    print("Gene", gene, "not found in the database.")

                # insert in chunks of 400; larger batches cause problems with some engines
                if len(associations) > 400:
                    db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                    associations = []

        # Add extended GOs
        for gene, terms in gene_go.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []

                for term in terms:
                    if term not in current_terms:
                        current_terms.append(term)

                for term in terms:
                    if term in go_hash.keys():
                        extended_terms = go_hash[term].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms and extended_term not in new_terms:
                                new_terms.append(extended_term)

                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                        associations = []

        # flush the remaining associations (skip the insert when nothing is left)
        if associations:
            db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def predict_from_network(expression_network_method_id, threshold=5, source="PlaNet Prediction"):
        """
        Function to transfer GO terms from neighbors in the network. If n or more (based on threshold) neighbors have a
        GO label (excluding other predicted labels) the term is transferred.

        :param expression_network_method_id: Expression network as input
        :param threshold: number of neighboring genes that should have the label to allow transfer
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        # Get all genes that belong to the network
        probes = expression_network_method.probes.all()

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # If the number of genes in the neighborhood is smaller than the threshold skip (no prediction possible)
            # If there is no sequence associated with the probe skip as well
            if len(sequence_ids) < threshold or probe.sequence_id is None:
                continue

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = set([(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms])

            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # Determine new terms (that occurred equal or more times than the desired threshold)
            new_terms = [{
                'go_id': k,
                'score': v
            } for k, v in go_counts.items() if v >= threshold]

            # Store new terms in a list that can be added to the database
            for nt in new_terms:
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': nt['go_id'],
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'score': nt['score'],
                                                   'threshold': threshold,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighbor counting'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])

    @staticmethod
    def predict_from_network_enrichment(expression_network_method_id, cutoff=0.05, source="PlaNet Prediction"):
        """
        Transfers GO terms based on enrichment in a gene's network neighborhood:
        a term is predicted when it is significantly over-represented (hypergeometric
        test, FDR-corrected) among the neighbors compared to the species background.

        :param expression_network_method_id: Expression network as input
        :param cutoff: p-value cutoff for the (uncorrected) enrichment test
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        probes = expression_network_method.probes.all()

        # Get all GO terms and get background
        # Important, counts are obtained from precomputed counts in the species_counts field !!
        go_data = db.engine.execute(db.select([GO.__table__.c.id, GO.__table__.c.species_counts])).fetchall()

        go_background = defaultdict(lambda: 0)

        for go_id, counts_json in go_data:
            # NOTE: truthiness check (was an identity comparison "is not ''", which is
            # unreliable for strings); also skips None/NULL values
            if counts_json:
                counts = json.loads(counts_json)
                if str(expression_network_method.species_id) in counts.keys():
                    go_background[go_id] = counts[str(expression_network_method.species_id)]

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = set([(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms])
            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # find significantly enriched GO terms and store them
            enriched_go = []

            for go_id, count in go_counts.items():
                p_value = hypergeo_sf(count, len(sequence_ids), go_background[go_id], len(probes))
                if p_value < cutoff:
                    enriched_go.append((go_id, p_value))

            # apply FDR correction to the p-values
            corrected_p = fdr_correction([a[1] for a in enriched_go])

            # push new prediction in a dict that will be added to the DB
            # (loop variable renamed so it no longer shadows the corrected_p list)
            for fdr_p, (go_id, p_value) in zip(corrected_p, enriched_go):
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': go_id,
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'p-cutoff': cutoff,
                                                   'p-value': p_value,
                                                   'p-value (FDR)': fdr_p,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighborhood enrichment'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])
Exemple #16
0
class Sequence(db.Model):
    """
    A gene/transcript sequence for one species, with its coding sequence,
    functional annotation links (GO, InterPro, families) and network/cluster
    relationships.
    """
    __tablename__ = 'sequences'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    description = db.Column(db.Text)
    # deferred: the (potentially large) CDS is only loaded when explicitly requested
    coding_sequence = db.deferred(db.Column(db.Text))
    type = db.Column(db.Enum('protein_coding',
                             'TE',
                             'RNA',
                             name='sequence_type'),
                     default='protein_coding')
    is_mitochondrial = db.Column(db.SmallInteger, default=False)
    is_chloroplast = db.Column(db.SmallInteger, default=False)

    expression_profiles = db.relationship('ExpressionProfile',
                                          backref=db.backref('sequence',
                                                             lazy='joined'),
                                          lazy='dynamic',
                                          cascade="all, delete-orphan",
                                          passive_deletes=True)
    network_nodes = db.relationship('ExpressionNetwork',
                                    backref=db.backref('sequence',
                                                       lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    # Other properties
    #
    # coexpression_cluster_associations declared in 'SequenceCoexpressionClusterAssociation'
    # interpro_associations declared in 'SequenceInterproAssociation'
    # go_associations declared in 'SequenceGOAssociation'
    # family_associations declared in 'SequenceFamilyAssociation'

    go_labels = db.relationship('GO', secondary=sequence_go, lazy='dynamic')
    interpro_domains = db.relationship('Interpro',
                                       secondary=sequence_interpro,
                                       lazy='dynamic')
    families = db.relationship('GeneFamily',
                               secondary=sequence_family,
                               lazy='dynamic')

    coexpression_clusters = db.relationship(
        'CoexpressionCluster',
        secondary=sequence_coexpression_cluster,
        backref=db.backref('sequences', lazy='dynamic'),
        lazy='dynamic')

    ecc_query_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.query_id == Sequence.id",
        backref=db.backref('query_sequence', lazy='joined'),
        lazy='dynamic')

    ecc_target_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.target_id == Sequence.id",
        backref=db.backref('target_sequence', lazy='joined'),
        lazy='dynamic')

    clade_associations_one = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin=
        "SequenceSequenceCladeAssociation.sequence_one_id == Sequence.id",
        backref=db.backref('sequence_one', lazy='joined'),
        lazy='dynamic')

    clade_associations_two = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin=
        "SequenceSequenceCladeAssociation.sequence_two_id == Sequence.id",
        backref=db.backref('sequence_two', lazy='joined'),
        lazy='dynamic')

    xrefs = db.relationship('XRef', secondary=sequence_xref, lazy='joined')

    def __init__(self,
                 species_id,
                 name,
                 coding_sequence,
                 type='protein_coding',
                 is_chloroplast=False,
                 is_mitochondrial=False,
                 description=None):
        self.species_id = species_id
        self.name = name
        self.description = description
        self.coding_sequence = coding_sequence
        self.type = type
        self.is_chloroplast = is_chloroplast
        self.is_mitochondrial = is_mitochondrial

    @property
    def protein_sequence(self):
        """
        Function to translate the coding sequence to the amino acid sequence. Will start at the first start codon and
        break after adding a stop codon (indicated by '*')

        :return: The amino acid sequence based on the coding sequence
        """
        return translate(self.coding_sequence)

    @property
    def aliases(self):
        """
        Returns a readable string with the aliases or tokens stored for this sequence in the table xrefs

        :return: human readable string with aliases or None
        """
        t = [x.name for x in self.xrefs if x.platform == 'token']

        return ", ".join(t) if len(t) > 0 else None

    @property
    def shortest_alias(self):
        """
        Returns the shortest alias

        :return: string with shortest alias or None (in case no aliases exist)
        """
        t = [x.name for x in self.xrefs if x.platform == 'token']

        return min(t, key=len) if len(t) > 0 else None

    @property
    def display_name(self):
        """
        Returns a name to display (from xrefs with display) if available otherwise return name

        :return: display name
        """
        t = [x.name for x in self.xrefs if x.platform == 'display']

        return t[0] if len(t) > 0 else self.name

    @property
    def best_name(self):
        """
        Checks if there is a display name, if not checks the shortest alias, otherwise returns name. To be used in e.g.
        graphs

        :return: string with best name to show in graphs, ...
        """
        # string equality, not identity ("is not" only worked by CPython object reuse)
        if self.display_name != self.name:
            return self.display_name
        elif self.shortest_alias is not None:
            return self.shortest_alias
        else:
            return self.name

    @property
    def readable_type(self):
        """
        Converts the type table to a readable string

        :return: string with readable version of the sequence type
        """
        conversion = {
            'protein_coding': 'protein coding',
            'TE': 'transposable element',
            'RNA': 'RNA'
        }

        if self.type in conversion.keys():
            return conversion[self.type]
        else:
            return 'other'

    @staticmethod
    def add_from_fasta(filename, species_id, compressed=False):
        """
        Reads a FASTA file and adds all sequences to the database for one species.

        :param filename: path to the FASTA file
        :param species_id: internal id of the species the sequences belong to
        :param compressed: read the file as gzip when True (default: False)
        :return: number of sequences in the input file
        """
        fasta_data = Fasta()
        fasta_data.readfile(filename, compressed=compressed)

        new_sequences = []

        # Loop over sequences, sorted by name (key here) and add to db
        for name, sequence in sorted(fasta_data.sequences.items(),
                                     key=operator.itemgetter(0)):
            new_sequence = {
                "species_id": species_id,
                "name": name,
                "description": None,
                "coding_sequence": sequence,
                "type": "protein_coding",
                "is_mitochondrial": False,
                "is_chloroplast": False
            }

            new_sequences.append(new_sequence)

            # add 400 sequences at the time, more can cause problems with some database engines
            if len(new_sequences) > 400:
                db.engine.execute(Sequence.__table__.insert(), new_sequences)
                new_sequences = []

        # add the last set of sequences (skip the insert when nothing is left)
        if new_sequences:
            db.engine.execute(Sequence.__table__.insert(), new_sequences)

        return len(fasta_data.sequences.keys())

    @staticmethod
    def add_descriptions(filename, species_id):
        """
        Reads a tab-delimited file (name<TAB>description) and stores the
        descriptions on the matching sequences of one species.

        :param filename: path to the description file
        :param species_id: internal id of the species
        """
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        seq_dict = {}

        for s in sequences:
            seq_dict[s.name] = s

        with open(filename, "r") as f_in:
            for i, line in enumerate(f_in):
                try:
                    name, description = line.strip().split('\t')
                except ValueError:
                    # malformed line: report and skip it. (The previous
                    # 'finally' version re-used stale name/description values
                    # from the preceding iteration, or raised NameError on the
                    # first line.)
                    print("Cannot parse line %d: \"%s\"" % (i, line),
                          file=sys.stderr)
                else:
                    if name in seq_dict.keys():
                        seq_dict[name].description = description

                if i % 400 == 0:
                    db.session.commit()

            db.session.commit()

    @staticmethod
    def export_cds(filename):
        """Writes all coding sequences to a FASTA file at the given path."""
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.coding_sequence), file=f_out)

    @staticmethod
    def export_protein(filename):
        """Writes all translated (protein) sequences to a FASTA file at the given path."""
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.protein_sequence), file=f_out)
Exemple #17
0
"""
Tables to be used to define many-to-many relations. In case additional parameters are defined on the relationship, an
additional model needs to be created that extends these.
"""

from conekt import db

# Plain many-to-many link between sequences and GO terms (no extra columns;
# associations with attributes use the SequenceGOAssociation model instead).
sequence_go = db.Table(
    'sequence_go', db.Column('id', db.Integer, primary_key=True),
    db.Column('sequence_id',
              db.Integer,
              db.ForeignKey('sequences.id'),
              index=True),
    db.Column('go_id', db.Integer, db.ForeignKey('go.id'), index=True))

# Plain many-to-many link between sequences and InterPro domains.
sequence_interpro = db.Table(
    'sequence_interpro',
    db.Column('id', db.Integer, primary_key=True),
    db.Column('sequence_id',
              db.Integer,
              db.ForeignKey('sequences.id'),
              index=True),
    db.Column('interpro_id',
              db.Integer,
              db.ForeignKey('interpro.id'),
              index=True),
)

sequence_family = db.Table(
    'sequence_family', db.Column('id', db.Integer, primary_key=True),
    db.Column('sequence_id',
Exemple #18
0
class Interpro(db.Model):
    """
    An InterPro domain (label + description), linked to the sequences it
    occurs in and to the smallest clade containing all species with it.
    """
    __tablename__ = 'interpro'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    description = db.Column(db.Text)

    # smallest clade containing all species with this domain
    # (set by Clade.update_clades_interpro, cleared if the clade is removed)
    clade_id = db.Column(db.Integer, db.ForeignKey('clades.id', ondelete='SET NULL'), index=True)

    sequences = db.relationship('Sequence', secondary=sequence_interpro, lazy='dynamic')

    # Other properties
    # sequence_associations = defined in SequenceInterproRelationship

    def __init__(self, label, description):
        self.label = label
        self.description = description

    @property
    def species_codes(self):
        """
        Finds all species the domain has genes from

        :return: a list of all species (codes)
        """
        sequences = self.sequences.options(joinedload('species')).all()

        output = []

        for s in sequences:
            if s.species.code not in output:
                output.append(s.species.code)

        return output

    @property
    def species_counts(self):
        """
        Generates a phylogenetic profile of the domain

        :return: a dict with counts per species (codes are keys)
        """
        sequences = self.sequences.options(joinedload('species')).all()

        output = {}

        for s in sequences:
            if s.species.code not in output:
                output[s.species.code] = 1
            else:
                output[s.species.code] += 1

        return output

    @staticmethod
    def sequence_stats(sequence_ids):
        """
        Takes a list of sequence IDs and returns InterPro stats for those sequences

        :param sequence_ids: list of sequence ids
        :return: dict with for each InterPro domain linked with any of the input sequences stats
        """
        data = SequenceInterproAssociation.query.filter(SequenceInterproAssociation.sequence_id.in_(sequence_ids)).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences):
        """
        Same as sequence_stats, but takes a query of sequences instead of a
        list of ids (avoids materializing the id list in Python).

        :param sequences: query yielding sequences
        :return: dict with stats per InterPro domain
        """
        subquery = sequences.subquery()
        data = SequenceInterproAssociation.query.join(subquery, SequenceInterproAssociation.sequence_id == subquery.c.id).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        """
        Tallies, per InterPro domain, how often it occurs and in which
        sequences/species, from a list of association records.

        :param associations: list of SequenceInterproAssociation objects
        :return: dict keyed by interpro_id with 'domain', 'count',
                 'sequences', 'species' and derived *_count entries
        """
        output = {}

        for d in associations:
            if d.interpro_id not in output.keys():
                output[d.interpro_id] = {
                    'domain': d.domain,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.interpro_id]['count'] += 1
                if d.sequence_id not in output[d.interpro_id]['sequences']:
                    output[d.interpro_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.interpro_id]['species']:
                    output[d.interpro_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @property
    def interpro_stats(self):
        """InterPro stats for all sequences carrying this domain."""
        # the subquery variant fetches the ids itself, no need to
        # materialize them here first
        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        """GO stats for all sequences carrying this domain."""
        from conekt.models.go import GO

        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        """Gene family stats for all sequences carrying this domain."""
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    @staticmethod
    def add_from_xml(filename, empty=True):
        """
        Populates interpro table with domains and descriptions from the official website's XML file

        :param filename: path to XML file
        :param empty: If True the interpro table will be cleared before uploading the new domains, default = True
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(Interpro).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        interpro_parser = InterproParser()

        interpro_parser.readfile(filename)

        for i, domain in enumerate(interpro_parser.domains):
            interpro = Interpro(domain.label, domain.description)

            db.session.add(interpro)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def __load_domain_associations(annotation, sequences):
        """
        Bulk-inserts sequence <-> InterPro domain associations. Shared
        implementation for the PLAZA and InterProScan loaders.

        :param annotation: dict mapping gene names to lists of domain dicts
                           (each with "id", "start" and "stop" keys)
        :param sequences: iterable of Sequence objects the annotation may refer to
        """
        gene_hash = {sequence.name: sequence for sequence in sequences}
        domain_hash = {domain.label: domain for domain in Interpro.query.all()}

        new_domains = []

        for gene, domains in annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for domain in domains:
                    if domain["id"] in domain_hash.keys():
                        current_domain = domain_hash[domain["id"]]

                        new_domains.append({"sequence_id": current_sequence.id,
                                            "interpro_id": current_domain.id,
                                            "start": domain["start"],
                                            "stop": domain["stop"]})
                    else:
                        print(domain["id"], "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            # flush in batches to keep memory usage bounded
            if len(new_domains) > 400:
                db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
                new_domains = []

        # only insert the remainder when there is one: executing an insert with
        # an empty parameter list would add a single row of column defaults
        if new_domains:
            db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)

    @staticmethod
    def add_interpro_from_plaza(filename):
        """
        Adds InterPro domain annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_plaza_interpro(filename)

        Interpro.__load_domain_associations(interpro_parser.annotation,
                                            Sequence.query.all())

    @staticmethod
    def add_interpro_from_interproscan(filename, species_id):
        """
        Adds InterPro domain annotation from InterProScan output to the database

        :param filename: Path to the annotation file
        :param species_id: id of the species the annotated sequences belong to
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_interproscan(filename)

        Interpro.__load_domain_associations(interpro_parser.annotation,
                                            Sequence.query.filter_by(species_id=species_id))
Exemple #19
0
class SequenceGOAssociation(db.Model):
    """
    Association between a sequence and a GO term, carrying the evidence code,
    the annotation source and optional prediction metadata.
    """
    __tablename__ = 'sequence_go'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    # GO evidence code for this annotation
    evidence = db.Column(
        db.Enum('EXP',
                'IDA',
                'IPI',
                'IMP',
                'IGI',
                'IEP',
                'ISS',
                'ISO',
                'ISA',
                'ISM',
                'IGC',
                'IBA',
                'IBD',
                'IKR',
                'IRD',
                'RCA',
                'TAS',
                'NAS',
                'IC',
                'ND',
                'IEA',
                name='evidence'))
    source = db.Column(db.Text)

    # flag + JSON payload for associations added by a prediction pipeline
    predicted = db.Column(db.SmallInteger, default=False)
    prediction_data = db.Column(db.Text)

    sequence = db.relationship('Sequence',
                               backref=db.backref('go_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    go = db.relationship('GO',
                         backref=db.backref('sequence_associations',
                                            lazy='dynamic',
                                            passive_deletes=True),
                         lazy='joined')

    def __init__(self,
                 sequence_id,
                 go_id,
                 evidence,
                 source,
                 predicted=False,
                 prediction_data=None):
        self.sequence_id = sequence_id
        self.go_id = go_id
        self.evidence = evidence
        self.source = source
        self.predicted = predicted
        self.prediction_data = prediction_data

    @property
    def data(self):
        """
        Property to get the information in the prediction_data as a dict. Useful for showing these values in e.g. jinja2
        templates

        :return: de-serialized prediction_data (json), or None when no prediction data is stored
        """
        # prediction_data is NULL for non-predicted associations;
        # json.loads(None) would raise a TypeError
        return json.loads(self.prediction_data) if self.prediction_data is not None else None
Exemple #20
0
class Clade(db.Model):
    """
    A phylogenetic clade: a named set of species (stored as a JSON list of
    species codes) together with a newick tree for that clade.
    """
    __tablename__ = 'clades'
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION),
                     unique=True,
                     index=True)
    # JSON-encoded list of species codes belonging to the clade
    species = db.Column(db.Text(collation=SQL_COLLATION))
    species_count = db.Column(db.Integer)
    newick_tree = db.Column(db.Text)

    families = db.relationship('GeneFamily', backref='clade', lazy='dynamic')
    interpro = db.relationship('Interpro', backref='clade', lazy='dynamic')

    def __init__(self, name, species, tree):
        self.name = name
        self.species = json.dumps(species)
        self.species_count = len(species)
        self.newick_tree = tree

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @staticmethod
    def add_clade(name, species, tree):
        """
        Add a clade to the database

        :param name: name of the clade
        :param species: list with codes (!) of the species in the clade
        :param tree: newick tree for this clade. Will be stored in the database and used for visualizations
        """
        new_clade = Clade(name, species, tree)
        db.session.add(new_clade)
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_clades_from_json(data):
        """
        Adds clades from a dict with clade details

        :param data: dict mapping clade names to details (keys 'species' and 'tree')
        """
        # distinct loop variables: the original shadowed the 'data' parameter
        for clade_name, details in data.items():
            Clade.add_clade(clade_name, details['species'], details['tree'])

    @staticmethod
    def __assign_clades(items):
        """
        For each item (gene family or InterPro domain) find the clade with the
        fewest species that still contains all species the item occurs in, and
        store it in the item's clade_id. Commits the result.

        :param items: iterable of objects with a species_codes property and a clade_id column
        """
        clades = Clade.query.all()

        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        for item in items:
            codes = item.species_codes

            # items without members cannot be placed in a clade
            if len(codes) == 0:
                item.clade_id = None
                continue

            # find the clade with the fewest species that contains all the codes
            selected_clade, _ = get_clade(codes, clade_to_species)
            item.clade_id = clade_to_id[selected_clade] if selected_clade is not None else None

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def update_clades():
        """
        Loop over all families and determine what clade they belong to. Results are stored in the database
        """
        Clade.__assign_clades(GeneFamily.query.all())

    @staticmethod
    def update_clades_interpro():
        """
        Loop over all InterPro domains and determine what clade they belong to. Results are stored in the database
        """
        Clade.__assign_clades(Interpro.query.all())

    @property
    def newick_tree_species(self):
        """
        Returns a Newick tree with the species present in the current clade.

        :return: Newick tree (string) with species for the current clade
        """
        species = {s.code: s.name for s in Species.query.all()}

        tree = newick.loads(self.newick_tree)[0]

        # replace species codes in the tree by the full species names
        for code, name in species.items():
            node = tree.get_node(code)
            if node is not None:
                node.name = name

        return newick.dumps([tree])
Exemple #21
0
class ExpressionNetworkMethod(db.Model):
    """
    Describes how a co-expression network was built for one species
    (edge type, cutoffs) and links to its probes and clustering methods.
    """
    __tablename__ = 'expression_network_methods'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id'), index=True)
    description = db.Column(db.Text)
    edge_type = db.Column(db.Enum("rank", "weight", name='edge_type'))
    probe_count = db.Column(db.Integer)

    hrr_cutoff = db.Column(db.Integer)
    pcc_cutoff = db.Column(db.Float)
    enable_second_level = db.Column(db.SmallInteger)

    probes = db.relationship('ExpressionNetwork',
                             backref=db.backref('method', lazy='joined'),
                             lazy='dynamic',
                             cascade="all, delete-orphan",
                             passive_deletes=True)

    clustering_methods = db.relationship('CoexpressionClusteringMethod',
                                         backref='network_method',
                                         lazy='dynamic',
                                         cascade='all, delete-orphan',
                                         passive_deletes=True)

    def __init__(self, species_id, description, edge_type="rank"):
        self.species_id = species_id
        self.description = description
        self.edge_type = edge_type
        self.enable_second_level = False

    def __repr__(self):
        return str(self.id) + ". " + self.description + ' [' + str(self.species) + ']'

    @staticmethod
    def update_count():
        """
        To avoid long count queries the number of networks for each method can be precalculated and stored in the
        database using this function
        """
        methods = ExpressionNetworkMethod.query.all()

        for m in methods:
            m.probe_count = m.probes.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    @benchmark
    def calculate_ecc(network_method_ids, gene_family_method_id, max_size=100):
        """
        Function to calculate the ECC scores in and between genes of different networks

        ORM free method for speed !

        :param network_method_ids: array of networks (using their internal id !) to compare
        :param gene_family_method_id: internal id of the type of family methods to be used for the comparison
        :param max_size: cap on family/neighborhood size used when building the permutation thresholds
        """

        network_families = {}
        sequence_network = {}
        sequence_network_method = {}
        sequence_family = {}
        family_sequence = {}

        # Get all the network information and store in dictionary
        for n in network_method_ids:
            current_network = db.engine.execute(db.select([ExpressionNetwork.__table__.c.sequence_id,
                                                           ExpressionNetwork.__table__.c.network,
                                                           ExpressionNetwork.__table__.c.method_id]).
                                                where(ExpressionNetwork.__table__.c.method_id == n).
                                                where(ExpressionNetwork.__table__.c.sequence_id.isnot(None))
                                                ).fetchall()

            for sequence, network, network_method_id in current_network:
                if sequence is not None:
                    sequence_network[int(sequence)] = network
                    sequence_network_method[int(sequence)] = int(network_method_id)

        # Get family data and store in dictionary
        current_families = db.engine.execute(db.select([SequenceFamilyAssociation.__table__.c.sequence_id,
                                                        SequenceFamilyAssociation.__table__.c.gene_family_id,
                                                        GeneFamily.__table__.c.method_id]).
                                             select_from(SequenceFamilyAssociation.__table__.join(GeneFamily.__table__)).
                                             where(GeneFamily.__table__.c.method_id == gene_family_method_id)
                                             ).fetchall()

        for sequence, family, method in current_families:
            sequence_family[int(sequence)] = int(family)

            if family not in family_sequence.keys():
                family_sequence[int(family)] = []

            family_sequence[int(family)].append(int(sequence))

        # Create a dict (key = network) with the families present in that network
        # Families that occur multiple times should be present multiple times as this is used
        # to set threshholds later !

        for sequence, network_method in sequence_network_method.items():
            # ignore sequences without a family, ideally this shouldn't happen
            if network_method not in network_families.keys():
                network_families[network_method] = []

            if sequence in sequence_family.keys():
                family = sequence_family[sequence]
                network_families[network_method].append(family)

        # Determine threshold and p-value
        # A background model will be computed for each combination of networks, an ECC score will need to be better
        # than 95 % of the randomly found values to be considered significant

        thresholds = {}
        print("Starting permutation tests")
        for n in network_method_ids:
            thresholds[n] = {}
            for m in network_method_ids:
                thresholds[n][m] = ExpressionNetworkMethod.__set_thresholds(network_families[n],
                                                                            network_families[m],
                                                                            max_size=max_size)

        # Data loaded start calculating ECCs
        new_ecc_scores = []

        for family, sequences in family_sequence.items():
            for i in range(len(sequences) - 1):
                query = sequences[i]
                for j in range(i+1, len(sequences)):
                    target = sequences[j]
                    if query in sequence_network.keys() and target in sequence_network.keys() and query != target:
                        # Ignore genes with overlapping neighborhoods
                        if not ExpressionNetworkMethod.__neighborhoods_overlap(sequence_network[query], sequence_network[target]):
                            ecc, significant = ExpressionNetworkMethod.__ecc(sequence_network[query],
                                                                             sequence_network[target],
                                                                             sequence_family,
                                                                             thresholds[sequence_network_method[query]][sequence_network_method[target]],
                                                                             family,
                                                                             max_size=max_size)
                            if significant:
                                new_ecc_scores.append({
                                    'query_id': query,
                                    'target_id': target,
                                    'ecc': ecc,
                                    'gene_family_method_id': gene_family_method_id,
                                    'query_network_method_id': sequence_network_method[query],
                                    'target_network_method_id': sequence_network_method[target],
                                })

                                # add reciprocal relation
                                new_ecc_scores.append({
                                    'query_id': target,
                                    'target_id': query,
                                    'ecc': ecc,
                                    'gene_family_method_id': gene_family_method_id,
                                    'query_network_method_id': sequence_network_method[target],
                                    'target_network_method_id': sequence_network_method[query],
                                })
                                if len(new_ecc_scores) > 400:
                                    db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)
                                    new_ecc_scores = []

        # insert the remaining scores; skip when empty, as executing an insert
        # with an empty parameter list would add a row of column defaults
        if new_ecc_scores:
            db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)

    @staticmethod
    def __neighborhoods_overlap(neighborhood_a, neighborhood_b):
        """
        Checks if two genes have overlapping networks

        :param neighborhood_a: neighborhood for first gene (string as stored in database)
        :param neighborhood_b: neighborhood for second gene (string as stored in database)
        :return: Bool, true if networks overlap
        """
        genes_a = set([n['gene_id'] for n in json.loads(neighborhood_a) if n['gene_id'] is not None])
        genes_b = set([n['gene_id'] for n in json.loads(neighborhood_b) if n['gene_id'] is not None])

        return len(genes_a.intersection(genes_b)) > 0

    @staticmethod
    def __ecc(q_network, t_network, families, thresholds, query_family, max_size=30):
        """
        Takes the networks neighborhoods (as stored in the databases), extracts the genes and find the families for
        each gene. Next the ECC score is calculated

        :param q_network: network for the query gene
        :param t_network: network for the target gene
        :param families: dictionary that links a sequence id (key) to a family id (value)
        :param thresholds: significance thresholds matrix from __set_thresholds
        :param query_family: name of the input gene family
        :param max_size: cap on neighborhood size when indexing the thresholds matrix
        :return: the ECC score for the two input neighborhoods given the families, a boolean flag if this is significant
        """
        q_data = json.loads(q_network)
        t_data = json.loads(t_network)

        q_genes = [t['gene_id'] for t in q_data if t['gene_id'] is not None]
        t_genes = [t['gene_id'] for t in t_data if t['gene_id'] is not None]

        q_families = [families[q] for q in q_genes if q in families.keys() and families[q] != query_family]
        t_families = [families[t] for t in t_genes if t in families.keys() and families[t] != query_family]

        if len(q_families) == 0 or len(t_families) == 0:
            return 0.0, False
        else:
            ecc = jaccard(q_families, t_families)

            q_size = len(set(q_families)) if len(set(q_families)) < max_size else max_size
            t_size = len(set(t_families)) if len(set(t_families)) < max_size else max_size

            t = thresholds[q_size-1][t_size-1]

            return ecc, ecc > t

    @staticmethod
    @benchmark
    def __set_thresholds(families_a, families_b, max_size=30, iterations=1000, step=5):
        """
        Empirically determine (permutation test) thresholds for ECC

        :param families_a: families of species_a (list of internal family ids)
        :param families_b: families of species_b (list of internal family ids)
        :param max_size: maximum number of families (default = 30)
        :param iterations: number of permutations done
        :param step: step size
        :return: matrix (list of lists) with the thresholds at various family sizes
        """
        thresholds = []

        for i in range(0, max_size, step):
            print("%d done" % i)
            new_threshholds = []
            for j in range(0, max_size, step):
                scores = []
                for _ in range(iterations):
                    if i+1 < len(families_a) and j+1 < len(families_b):
                        i_fams = random.sample(families_a, i+1)
                        j_fams = random.sample(families_b, j+1)
                        scores.append(jaccard(i_fams, j_fams))
                    else:
                        # Cannot calculate threshold with these families, add 1
                        scores.append(1)

                # TODO (maybe?): cutoff is hard coded here, replace ?
                # (the leftover debug print of the full score list was removed)
                scores = sorted(scores)
                for _ in range(step):
                    new_threshholds.append(scores[int(iterations*0.95)])
            for _ in range(step):
                thresholds.append(new_threshholds)

        return thresholds
Exemple #22
0
class SequenceSequenceECCAssociation(db.Model):
    """
    Pairwise Expression Context Conservation (ECC) score between two
    sequences, together with the gene family method and the expression
    network methods that were used to compute it.

    Also provides static helpers that assemble node/edge dicts (which can be
    made compatible with CytoscapeHelper) around these associations.
    """
    __tablename__ = 'sequence_sequence_ecc'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    # the pair of sequences this ECC score applies to
    query_id = db.Column(db.Integer,
                         db.ForeignKey('sequences.id', ondelete='CASCADE'))
    target_id = db.Column(db.Integer,
                          db.ForeignKey('sequences.id', ondelete='CASCADE'))

    # ECC score plus raw and multiple-testing-corrected p-values
    ecc = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    # methods used to derive the score: the gene family definition and the
    # expression network used on the query and target side, respectively
    gene_family_method_id = db.Column(
        db.Integer, db.ForeignKey('gene_family_methods.id',
                                  ondelete='CASCADE'))
    query_network_method_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))
    target_network_method_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))

    gene_family_method = db.relationship('GeneFamilyMethod',
                                         lazy='joined',
                                         backref=db.backref(
                                             'ecc_as_family_method',
                                             lazy='dynamic',
                                             passive_deletes=True))

    query_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[query_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_query_method',
                           lazy='dynamic',
                           passive_deletes=True))
    target_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[target_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_target_method',
                           lazy='dynamic',
                           passive_deletes=True))

    @staticmethod
    def get_ecc_network(sequence, network, family):
        """
        Get network connecting a specific sequence to all genes with significant Expression Context Conservation.


        :param sequence: internal ID of sequence
        :param network: network method ID to consider
        :param family: kind of gene families used to detect ECC
        :return: network dict (can be made compatible using CytoscapeHelper)
        """
        # all ECC hits for this query sequence / network method / family method
        data = SequenceSequenceECCAssociation.query.filter(
            and_(
                SequenceSequenceECCAssociation.query_id == sequence,
                SequenceSequenceECCAssociation.query_network_method_id ==
                network, SequenceSequenceECCAssociation.gene_family_method_id
                == family)).all()

        # return an empty dict in case there are no hits for this query
        if len(data) < 1:
            return {'nodes': [], 'edges': []}

        # add the query node
        d = data[0]
        nodes = [{
            "id": d.query_sequence.name,
            "name": d.query_sequence.name,
            "species_id": d.query_sequence.species_id,
            "species_name": d.query_sequence.species.name,
            "gene_id": d.query_id,
            "gene_name": d.query_sequence.name,
            "network_method_id": network,
            "node_type": "query"
        }]
        edges = []

        # group target sequence ids per target network method so connections
        # between the targets themselves can be looked up per network below
        networks = {}

        for d in data:
            nodes.append({
                "id": d.target_sequence.name,
                "name": d.target_sequence.name,
                "species_id": d.target_sequence.species_id,
                "species_name": d.target_sequence.species.name,
                "gene_id": d.target_id,
                "network_method_id": d.target_network_method_id,
                "gene_name": d.target_sequence.name
            })

            if d.target_network_method_id not in networks.keys():
                networks[d.target_network_method_id] = []
            networks[d.target_network_method_id].append(d.target_id)

            # TODO: add p-value and corrected p once implemented
            edges.append({
                "source": d.query_sequence.name,
                "target": d.target_sequence.name,
                "ecc_score": d.ecc,
                "edge_type": 0
            })

        # add ECC edges between the targets themselves (within one network)
        for n, sequences in networks.items():
            new_data = SequenceSequenceECCAssociation.query.filter(
                and_(
                    SequenceSequenceECCAssociation.query_id.in_(sequences),
                    SequenceSequenceECCAssociation.target_id.in_(sequences),
                    SequenceSequenceECCAssociation.target_network_method_id ==
                    n, SequenceSequenceECCAssociation.query_network_method_id
                    == n, SequenceSequenceECCAssociation.gene_family_method_id
                    == family, SequenceSequenceECCAssociation.query_id !=
                    SequenceSequenceECCAssociation.target_id)).all()

            for nd in new_data:
                # TODO: add p-value and corrected p once implemented
                # make sure the connection doesn't exist already
                # (pairs can occur in both orientations in the table)
                if not any(d['source'] == nd.target_sequence.name
                           and d['target'] == nd.query_sequence.name
                           for d in edges):
                    edges.append({
                        "source": nd.query_sequence.name,
                        "target": nd.target_sequence.name,
                        "ecc_score": nd.ecc,
                        "edge_type": 1
                    })

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def get_ecc_pair_network(ecc_id):
        """
        Get all data for an SequenceSequenceECCAssociation to make a ECC graph, similar to the pairwise comparisons in
        Movahedi et al.

        :param ecc_id: interal id of the SequenceSequenceECCAssociation
        :return: tuple of (graph dict with the pair plus both expression
            neighborhoods, gene family method id used for the association)
        """

        association = SequenceSequenceECCAssociation.query.get_or_404(ecc_id)

        # both members of the pair are added as query nodes
        nodes = [
            {
                "id": association.query_sequence.name,
                "name": association.query_sequence.name,
                "species_id": association.query_sequence.species_id,
                "species_name": association.query_sequence.species.name,
                "gene_id": association.query_id,
                "gene_name": association.query_sequence.name,
                "network_method_id": association.query_network_method_id,
                "node_type": "query"
            },
            {
                "id": association.target_sequence.name,
                "name": association.target_sequence.name,
                "species_id": association.target_sequence.species_id,
                "species_name": association.target_sequence.species.name,
                "gene_id": association.target_id,
                "gene_name": association.target_sequence.name,
                "network_method_id": association.target_network_method_id,
                "node_type": "query"
            },
        ]

        # the ECC edge connecting the pair (red in the UI)
        edges = [{
            "source": association.query_sequence.name,
            "target": association.target_sequence.name,
            "ecc_score": association.ecc,
            'ecc_pair_color': "#D33",
            "edge_type": "ecc"
        }]

        # JSON-encoded expression neighborhoods of both members, 404 if the
        # sequence has no node in the network method used for the association
        query_network = association.query_sequence.network_nodes.filter_by(
            method_id=association.query_network_method_id).first_or_404(
            ).network
        target_network = association.target_sequence.network_nodes.filter_by(
            method_id=association.target_network_method_id).first_or_404(
            ).network

        query_network_data = json.loads(query_network)
        target_network_data = json.loads(target_network)

        # ids already present as nodes, to avoid duplicates
        sequences = [
            association.query_sequence.id, association.target_sequence.id
        ]

        for n in query_network_data:
            gene_id = n['gene_id'] if 'gene_id' in n.keys() else None
            gene_name = n['gene_name'] if 'gene_name' in n.keys() else None

            if gene_id not in sequences:
                nodes.append({
                    "id":
                    gene_name,
                    "name":
                    gene_name,
                    "species_id":
                    association.query_sequence.species_id,
                    "species_name":
                    association.query_sequence.species.name,
                    "gene_id":
                    gene_id,
                    "gene_name":
                    gene_name,
                    "network_method_id":
                    association.query_network_method_id,
                    "node_type":
                    "target"
                })
                sequences.append(gene_id)

            # co-expression edge (green in the UI), added even when the
            # neighbor node already existed
            edges.append({
                "source":
                association.query_sequence.name,
                "target":
                gene_name,
                "link_score":
                n['link_score'] if 'link_score' in n else 0,
                "edge_type":
                "expression",
                'ecc_pair_color':
                "#3D3"
            })

        for n in target_network_data:
            gene_id = n['gene_id'] if 'gene_id' in n.keys() else None
            gene_name = n['gene_name'] if 'gene_name' in n.keys() else None

            if gene_id not in sequences:
                sequences.append(gene_id)
                nodes.append({
                    "id":
                    gene_name,
                    "name":
                    gene_name,
                    "species_id":
                    association.target_sequence.species_id,
                    "species_name":
                    association.target_sequence.species.name,
                    "gene_id":
                    gene_id,
                    "gene_name":
                    gene_name,
                    "network_method_id":
                    association.target_network_method_id,
                    "node_type":
                    "target"
                })

            edges.append({
                "source":
                association.target_sequence.name,
                "target":
                gene_name,
                "link_score":
                n['link_score'] if 'link_score' in n else 0,
                "edge_type":
                "expression",
                'ecc_pair_color':
                "#3D3"
            })

        return {
            "nodes": nodes,
            "edges": edges
        }, association.gene_family_method_id

    @staticmethod
    def get_ecc_multi_network(gf_method_id, sequence_ids):
        """
        Creates an ECC network for multiple genes, the resulting network will contain all ECC partners of the input
        genes. Pruning this network keeping only genes with non-unique label co-occurances is recommended !


        :param gf_method_id: gene family method used to detect ECC
        :param sequence_ids: sequences to include as the core of the network
        :return: tuple of (network dict, gf_method_id)
        """
        # only associations where BOTH ends are part of the requested core set
        associations = SequenceSequenceECCAssociation.query.\
            filter(SequenceSequenceECCAssociation.gene_family_method_id == gf_method_id).\
            filter(and_(SequenceSequenceECCAssociation.query_id.in_(sequence_ids),
                        SequenceSequenceECCAssociation.target_id.in_(sequence_ids))).\
            all()

        nodes, edges = [], []
        node_sequence_ids = []      # ids already emitted as nodes (dedup)

        # neighborhoods to expand later, one tuple per distinct network string
        networks = []

        for a in associations:
            query_network = a.query_sequence.network_nodes.filter_by(
                method_id=a.query_network_method_id).first_or_404().network
            target_network = a.target_sequence.network_nodes.filter_by(
                method_id=a.target_network_method_id).first_or_404().network

            # NOTE: membership test is on the raw JSON string (6th tuple
            # element), so identical neighborhoods are only stored once
            if query_network not in networks:
                networks.append((a.query_id, a.query_sequence.name,
                                 a.query_sequence.species_id,
                                 a.query_sequence.species.name,
                                 a.query_network_method_id, query_network))
            if target_network not in networks:
                networks.append((a.target_id, a.target_sequence.name,
                                 a.target_sequence.species_id,
                                 a.target_sequence.species.name,
                                 a.target_network_method_id, target_network))

            if a.query_id not in node_sequence_ids:
                node_sequence_ids.append(a.query_id)
                nodes.append({
                    "id": a.query_sequence.name,
                    "name": a.query_sequence.name,
                    "species_id": a.query_sequence.species_id,
                    "species_name": a.query_sequence.species.name,
                    "gene_id": a.query_id,
                    "gene_name": a.query_sequence.name,
                    "network_method_id": a.query_network_method_id,
                    "node_type": "query"
                })

            if a.target_id not in node_sequence_ids:
                node_sequence_ids.append(a.target_id)
                nodes.append({
                    "id": a.target_sequence.name,
                    "name": a.target_sequence.name,
                    "species_id": a.target_sequence.species_id,
                    "species_name": a.target_sequence.species.name,
                    "gene_id": a.target_id,
                    "gene_name": a.target_sequence.name,
                    "network_method_id": a.target_network_method_id,
                    "node_type": "query"
                })

            edges.append({
                "source": a.query_sequence.name,
                "target": a.target_sequence.name,
                "ecc_score": a.ecc,
                'ecc_pair_color': "#D33",
                "edge_type": "ecc"
            })

        # expression edges already added, stored in both orientations so the
        # reverse direction is not added twice
        new_edges = []

        for sequence_id, sequence_name, species_id, species_name, network_method_id, n in networks:
            network_data = json.loads(n)
            for node in network_data:
                gene_id = node['gene_id'] if 'gene_id' in node.keys() else None
                gene_name = node['gene_name'] if 'gene_name' in node.keys(
                ) else None

                if gene_id not in node_sequence_ids:
                    node_sequence_ids.append(gene_id)
                    nodes.append({
                        "id": gene_name,
                        "name": gene_name,
                        "species_id": species_id,
                        "species_name": species_name,
                        "gene_id": gene_id,
                        "gene_name": gene_name,
                        "network_method_id": network_method_id,
                        "node_type": "target"
                    })

                if (sequence_name, gene_name) not in new_edges:
                    new_edges.append((sequence_name, gene_name))
                    new_edges.append((gene_name, sequence_name))

                    edges.append({
                        "source":
                        sequence_name,
                        "target":
                        gene_name,
                        "link_score":
                        node['link_score'] if 'link_score' in node else 0,
                        "edge_type":
                        "expression",
                        'ecc_pair_color':
                        "#3D3"
                    })

        return {"nodes": nodes, "edges": edges}, gf_method_id
Exemple #23
0
class ExpressionNetwork(db.Model):
    """
    One node of a co-expression network: a probe/gene together with its
    neighborhood, stored as a JSON string in *network*. Provides helpers to
    expand neighborhoods into node/edge dicts and to import LSTrAP output.
    """
    __tablename__ = 'expression_networks'
    id = db.Column(db.Integer, primary_key=True)
    # probe/gene name this node represents
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    # linked sequence, can be NULL for probes without an associated gene
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'), index=True)
    # JSON list of linked probes (see __process_link for the expected keys)
    network = db.Column(db.Text)
    method_id = db.Column(db.Integer, db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'), index=True)

    def __init__(self, probe, sequence_id, network, method_id):
        """
        :param probe: probe/gene name
        :param sequence_id: internal id of the associated sequence (or None)
        :param network: JSON string with the neighborhood
        :param method_id: internal id of the network method
        """
        self.probe = probe
        self.sequence_id = sequence_id
        self.network = network
        self.method_id = method_id

    @property
    def neighbors_count(self):
        """
        Returns the number of neighbors the current gene has

        :return: int, number of neighbors
        """
        data = json.loads(self.network)

        return len(data)

    @property
    def neighbors_table(self):
        """
        Returns a tab delimited representation of the current gene's neighbors

        :return: string with one header line and one tab-delimited line per neighbor
        """
        data = json.loads(self.network)
        output = [["Sequence", "Description", "Alias", "PCC", "hrr"]]

        # Pull in descriptions and aliases
        sequence_ids = [d["gene_id"] for d in data if "gene_id" in d.keys() and d["gene_id"] is not None]
        sequences = {s.id: s for s in Sequence.query.filter(Sequence.id.in_(sequence_ids))}

        for d in data:
            try:
                description, alias = "", ""

                if d["gene_id"] in sequences.keys():
                    description = sequences[d["gene_id"]].description
                    alias = sequences[d["gene_id"]].aliases
                    description = description if description is not None else ""
                    alias = alias if alias is not None else ""

                output.append([d["gene_name"], description, alias, str(d["link_pcc"]), str(d["hrr"])])
            except Exception as e:
                # best effort: skip malformed neighbor entries instead of failing
                print(e)

        return '\n'.join(['\t'.join(l) for l in output])

    @staticmethod
    def get_neighborhood(probe, depth=0):
        """
        Get the coexpression neighborhood for a specific probe

        :param probe: internal ID of the probe
        :param depth: how many steps away from the query you wish to expand the network
        :return: dict with nodes and edges
        """
        node = ExpressionNetwork.query.get(probe)
        links = json.loads(node.network)

        method_id = node.method_id
        edge_type = node.method.edge_type

        # add the initial node
        nodes = [{"id": node.probe,
                  "name": node.probe,
                  "probe_id": node.id,
                  "gene_id": int(node.sequence_id) if node.sequence_id is not None else None,
                  "gene_name": node.sequence.name if node.sequence_id is not None else node.probe,
                  "node_type": "query",
                  "depth": 0}]
        edges = []

        # lists necessary for doing deeper searches
        # (edges are recorded in both orientations to dedupe either direction)
        additional_nodes = []
        existing_edges = []
        existing_nodes = [node.probe]

        # add direct neighbors of the gene of interest

        for link in links:
            nodes.append(ExpressionNetwork.__process_link(link, depth=0))
            edges.append({"source": node.probe,
                          "target": link["probe_name"],
                          "profile_comparison":
                              url_for('expression_profile.expression_profile_compare_probes',
                                      probe_a=node.probe,
                                      probe_b=link["probe_name"],
                                      species_id=node.method.species.id),
                          "depth": 0,
                          "link_score": link["link_score"],
                          "link_pcc": link["link_pcc"] if "link_pcc" in link.keys() else None,
                          "hrr": link["hrr"] if "hrr" in link.keys() else None,
                          "edge_type": edge_type})
            additional_nodes.append(link["probe_name"])
            existing_edges.append([node.probe, link["probe_name"]])
            existing_edges.append([link["probe_name"], node.probe])
            existing_nodes.append(link["probe_name"])

        # iterate n times to add deeper links
        if len(additional_nodes) > 0:
            for i in range(1, depth+1):
                new_nodes = ExpressionNetwork.\
                    query.filter(and_(ExpressionNetwork.probe.in_(additional_nodes),
                                      ExpressionNetwork.method_id == method_id))
                next_nodes = []

                for new_node in new_nodes:
                    new_links = json.loads(new_node.network)

                    for link in new_links:
                        if link["probe_name"] not in existing_nodes:
                            nodes.append(ExpressionNetwork.__process_link(link, depth=depth))
                            existing_nodes.append(link["probe_name"])
                            next_nodes.append(link["probe_name"])

                        if [new_node.probe, link["probe_name"]] not in existing_edges:
                            edges.append({"source": new_node.probe,
                                          "target": link["probe_name"],
                                          "profile_comparison":
                                              url_for('expression_profile.expression_profile_compare_probes',
                                                      probe_a=new_node.probe,
                                                      probe_b=link["probe_name"],
                                                      species_id=node.method.species.id),
                                          "depth": i,
                                          "link_score": link["link_score"],
                                          "link_pcc": link["link_pcc"] if "link_pcc" in link.keys() else None,
                                          "hrr": link["hrr"] if "hrr" in link.keys() else None,
                                          "edge_type": edge_type})
                            existing_edges.append([new_node.probe, link["probe_name"]])
                            existing_edges.append([link["probe_name"], new_node.probe])

                additional_nodes = next_nodes

        # Add links between the last set of nodes added
        new_nodes = []
        if len(additional_nodes) > 0:
            new_nodes = ExpressionNetwork.query.filter(and_(ExpressionNetwork.probe.in_(additional_nodes),
                                                            ExpressionNetwork.method_id == method_id))

        for new_node in new_nodes:
            new_links = json.loads(new_node.network)
            for link in new_links:
                # only close triangles between nodes already in the graph
                if link["probe_name"] in existing_nodes:
                    if [new_node.probe, link["probe_name"]] not in existing_edges:
                        edges.append({"source": new_node.probe,
                                      "target": link["probe_name"],
                                      "profile_comparison":
                                          url_for('expression_profile.expression_profile_compare_probes',
                                                  probe_a=new_node.probe,
                                                  probe_b=link["probe_name"],
                                                  species_id=node.method.species.id),
                                      "depth": depth+1,
                                      "link_score": link["link_score"],
                                      "link_pcc": link["link_pcc"] if "link_pcc" in link.keys() else None,
                                      "hrr": link["hrr"] if "hrr" in link.keys() else None,
                                      "edge_type": edge_type})
                        existing_edges.append([new_node.probe, link["probe_name"]])
                        existing_edges.append([link["probe_name"], new_node.probe])

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def get_custom_network(method_id, probes):
        """
        Return a network dict for a certain set of probes/sequences. Only returns the selected nodes and connections
        between them (if any)

        :param method_id: network method to extract information from
        :param probes: list of probe/sequence names
        :return: network dict
        """
        nodes = []
        edges = []

        probes = ExpressionNetwork.query.filter(ExpressionNetwork.method_id == method_id).\
            filter(ExpressionNetwork.probe.in_(probes)).all()

        valid_nodes = []

        for p in probes:
            node = {"id": p.probe,
                    "name": p.probe,
                    "probe_id": p.id,
                    "gene_id": int(p.sequence_id) if p.sequence_id is not None else None,
                    "gene_name": p.sequence.name if p.sequence_id is not None else p.probe,
                    "node_type": "query",
                    "depth": 0}

            valid_nodes.append(p.probe)
            nodes.append(node)

        # edges stored in both orientations to dedupe either direction
        existing_edges = []

        for p in probes:
            source = p.probe
            neighborhood = json.loads(p.network)
            for n in neighborhood:
                # only keep edges where both ends are in the selection
                if n["probe_name"] in valid_nodes:
                    if [source, n["probe_name"]] not in existing_edges:
                        edges.append({"source": source,
                                      "target": n["probe_name"],
                                      "profile_comparison":
                                          url_for('expression_profile.expression_profile_compare_probes',
                                                  probe_a=source,
                                                  probe_b=n["probe_name"],
                                                  species_id=p.method.species.id),
                                      "depth": 0,
                                      "link_score": n["link_score"],
                                      "link_pcc": n["link_pcc"] if "link_pcc" in n.keys() else None,
                                      "hrr": n["hrr"] if "hrr" in n.keys() else None,
                                      "edge_type": p.method.edge_type})
                        existing_edges.append([source, n["probe_name"]])
                        existing_edges.append([n["probe_name"], source])

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def __process_link(linked_probe, depth):
        """
        Internal function that processes a linked probe (from the ExpressionNetwork.network field) to a data entry
        compatible with cytoscape.js

        :param linked_probe: hash with information from ExpressionNetwork.network field
        :param depth: depth value to store on the resulting node
        :return: a hash formatted for use as a node with cytoscape.js
        """
        if linked_probe["gene_id"] is not None:
            return {"id": linked_probe["probe_name"],
                    "name": linked_probe["probe_name"],
                    "gene_id": linked_probe["gene_id"],
                    "gene_name": linked_probe["gene_name"],
                    "node_type": "linked",
                    "depth": depth}
        else:
            # probe without an associated gene: fall back to the probe name
            return {"id": linked_probe["probe_name"],
                    "name": linked_probe["probe_name"],
                    "gene_id": None,
                    "gene_name": linked_probe["probe_name"],
                    "node_type": "linked",
                    "depth": depth}

    @staticmethod
    def read_expression_network_lstrap(network_file, species_id, description, score_type="rank",
                                       pcc_cutoff=0.7, limit=30, enable_second_level=False):
        """
        Reads a network from disk, generated using LSTrAP, determing hrr scores for each pair and store things in the
        DB.

        :param network_file: path to input file
        :param species_id: species the data is from
        :param description: description to add to the db for this network
        :param score_type: which scores are used, default = "rank"
        :param pcc_cutoff: pcc threshold, pairs with a score below this will be ignored
        :param limit: hrr score threshold, pairs with a score above this will be ignored
        :param enable_second_level: include second level neighborhood in the database (only to be used for sparse networks)
        :return: internal ID of the new network
        """
        # build conversion table for sequences
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        sequence_dict = {}  # key = sequence name uppercase, value internal id
        for s in sequences:
            sequence_dict[s.name.upper()] = s.id

        # Add network method first
        network_method = ExpressionNetworkMethod(species_id, description, score_type)
        network_method.hrr_cutoff = limit
        network_method.pcc_cutoff = pcc_cutoff
        network_method.enable_second_level = enable_second_level

        db.session.add(network_method)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

        network = {}
        scores = defaultdict(lambda: defaultdict(lambda: None))     # Score for non-existing pairs will be None

        with open(network_file) as fin:
            for linenr, line in enumerate(fin):
                # expected line format: "<query>: <hit>(<pcc>)\t<hit>(<pcc>)..."
                try:
                    query, hits = line.strip().split(' ')
                    query = query.replace(':', '')
                except ValueError:
                    print("Error parsing line %d: \"%s\"" % (linenr, line))
                    # skip this line and continue
                    continue

                network[query] = {
                    "probe": query,
                    "sequence_id": sequence_dict[query.upper()] if query.upper() in sequence_dict.keys() else None,
                    "linked_probes": [],
                    "total_count": 0,
                    "method_id": network_method.id
                }

                for i, h in enumerate(hits.split('\t')):
                    try:
                        name, value = h.split('(')
                        value = float(value.replace(')', ''))
                        if value > pcc_cutoff:
                            network[query]["total_count"] += 1
                            # hits appear in rank order, so i is the rank score
                            if i < limit:
                                link = {"probe_name": name,
                                        "gene_name": name,
                                        "gene_id": sequence_dict[name.upper()] if name.upper() in sequence_dict.keys() else None,
                                        "link_score": i,
                                        "link_pcc": value}
                                network[query]["linked_probes"].append(link)
                                scores[query][name] = i
                    except ValueError as e:
                        # NOTE(review): %d here is the hit index within the line,
                        # not the file line number — message is misleading
                        print("Error on line %d, skipping ... (%s)" % (i, str(h)), file=sys.stderr)

        # HRR
        hr_ranks = defaultdict(lambda: defaultdict(int))

        for query, targets in scores.items():
            for target, score in targets.items():
                # hrr is only defined for reciprocal pairs; it is the worse
                # (max) of the two mutual rank scores
                if None in [score, scores[target][query]]:
                    hr_ranks[query][target] = None
                else:
                    # As scores start from 0 and ranks one, increase the hrr by one
                    hr_ranks[query][target] = max(score, scores[target][query]) + 1

        # Dump dicts into network string, which will be loaded into the database
        for query in network.keys():

            for i, l in enumerate(network[query]["linked_probes"]):
                network[query]["linked_probes"][i]["hrr"] = hr_ranks[query][l["probe_name"]]

            # Dump links WITH HRR into json string
            network[query]["network"] = json.dumps([n for n in network[query]["linked_probes"] if n['hrr'] is not None])

        # add nodes in sets of 400 to avoid sending to much in a single query
        new_nodes = []
        for _, n in network.items():
            new_nodes.append(n)
            if len(new_nodes) > 400:
                db.engine.execute(ExpressionNetwork.__table__.insert(), new_nodes)
                new_nodes = []

        # NOTE(review): new_nodes may be empty here (e.g. empty input file or a
        # flush on the last node); executing an insert with an empty parameter
        # list can misbehave on some backends — consider guarding with
        # `if new_nodes:` — confirm before changing
        db.engine.execute(ExpressionNetwork.__table__.insert(), new_nodes)

        return network_method.id
Exemple #24
0
class CoexpressionClusteringMethod(db.Model):
    """
    Describes one clustering run applied to an expression network.

    Each method groups the probes/sequences of one ExpressionNetworkMethod
    into CoexpressionCluster rows. Rows are created by the static builder
    methods below (neighborhood-based, HCCA, or imported MCL/LSTrAP output).
    """
    __tablename__ = 'coexpression_clustering_methods'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Network these clusters were derived from; cascades on network deletion
    network_method_id = db.Column(db.Integer,
                                  db.ForeignKey(
                                      'expression_network_methods.id',
                                      ondelete='CASCADE'),
                                  index=True)
    # Free-text description of the clustering method/parameters
    method = db.Column(db.Text)
    # Precomputed number of clusters (see update_counts)
    cluster_count = db.Column(db.Integer)

    # One-to-many: clusters produced by this method. backref 'method' on
    # CoexpressionCluster is eagerly joined; deletes are handled by the DB
    # (passive_deletes) via the ON DELETE CASCADE foreign keys.
    clusters = db.relationship('CoexpressionCluster',
                               backref=db.backref('method', lazy='joined'),
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)

    @staticmethod
    def update_counts():
        """
        To avoid long counts the number of clusters per method can be precalculated and stored in the database
        using this function
        """
        methods = CoexpressionClusteringMethod.query.all()

        for m in methods:
            m.cluster_count = m.clusters.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def clusters_from_neighborhoods(method, network_method_id):
        """
        Builds one cluster per network node: each sequence together with its
        direct neighbors in the network becomes a cluster named after the
        sequence. Results (method, clusters and sequence-cluster relations)
        are committed to the database in chunks.

        :param method: name/description for the new clustering method
        :param network_method_id: internal ID of the network to derive
            neighborhood clusters from
        """
        probes = ExpressionNetwork.query.filter_by(
            method_id=network_method_id).all()  # Load all probes

        clusters = defaultdict(list)
        clusters_orm = {}

        sequence_to_probe = {}

        for p in probes:
            # Only consider probes linked with sequences
            if p.sequence_id is not None:
                sequence_to_probe[p.sequence_id] = p.probe
                neighborhood = json.loads(p.network)
                # Keep only neighbors that resolve to a known sequence
                sequence_ids = [
                    n["gene_id"] for n in neighborhood
                    if "gene_id" in n.keys() and n["gene_id"] is not None
                ]

                # check if there are neighbors for this sequence
                if len(sequence_ids) > 0:
                    clusters[p.sequence.name] = [p.sequence_id] + sequence_ids

        # If there are valid clusters add them to the database
        if len(clusters) > 0:

            # Add new method first
            new_method = CoexpressionClusteringMethod()

            new_method.network_method_id = network_method_id
            new_method.method = method
            new_method.cluster_count = len(clusters)

            db.session.add(new_method)

            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        # Add Clusters
        # NOTE(review): new_method is only assigned inside the `if` above;
        # this is safe only because an empty `clusters` dict also makes this
        # loop a no-op.
        for cluster in clusters.keys():
            clusters_orm[cluster] = CoexpressionCluster()
            clusters_orm[cluster].method_id = new_method.id
            clusters_orm[cluster].name = cluster
            db.session.add(clusters_orm[cluster])

            # Commit in chunks of 400 to keep transactions small
            if len(clusters_orm) % 400 == 0:
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

        # Add sequence cluster relations
        for i, (cluster, members) in enumerate(clusters.items()):
            for sequence_id in members:
                relation = SequenceCoexpressionClusterAssociation()
                relation.sequence_id = sequence_id
                relation.coexpression_cluster_id = clusters_orm[cluster].id
                # Probe is optional: neighbors of a node may not have been
                # seen as probes themselves
                relation.probe = sequence_to_probe[
                    sequence_id] if sequence_id in sequence_to_probe.keys(
                    ) else None

                db.session.add(relation)

            # Commit every 20 clusters' worth of relations
            if i % 20 == 0:
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def build_hcca_clusters(method,
                            network_method_id,
                            step_size=3,
                            hrr_cutoff=30,
                            min_cluster_size=40,
                            max_cluster_size=200):
        """
        method to build HCCA clusters for a certain network

        :param method: Name for the current clustering method
        :param network_method_id: ID for the network to cluster
        :param step_size: desired step_size for the HCCA algorithm
        :param hrr_cutoff: desired hrr_cutoff for the HCCA algorithm
        :param min_cluster_size: minimal cluster size
        :param max_cluster_size: maximum cluster size
        """

        network_data = {}

        sequence_probe = {}

        # Get network from DB
        # NOTE(review): sep='' has no effect with a single argument;
        # end='' was presumably intended to suppress the newline.
        print("Loading Network data from DB...", sep='')
        ExpressionNetworkMethod.query.get_or_404(
            network_method_id)  # Check if method exists

        probes = ExpressionNetwork.query.filter_by(
            method_id=network_method_id).all()  # Load all probes

        for p in probes:
            # Loop over probes and store hrr for all neighbors
            if p.sequence_id is not None:
                neighborhood = json.loads(p.network)
                # Map neighbor gene_id -> HRR score for this sequence
                network_data[p.sequence_id] = {
                    nb["gene_id"]: nb["hrr"]
                    for nb in neighborhood if "gene_id" in nb.keys()
                    and "hrr" in nb.keys() and nb["gene_id"] is not None
                }

                sequence_probe[p.sequence_id] = p.probe

        # Double check edges are reciprocally defined
        for sequence, data in network_data.items():
            for neighbor, score in data.items():
                if neighbor not in network_data.keys():
                    network_data[neighbor] = {sequence: score}
                else:
                    if sequence not in network_data[neighbor].keys():
                        network_data[neighbor][sequence] = score

        print("Done!\nStarting to build Clusters...\n")

        # Build clusters
        hcca_util = HCCA(step_size=step_size,
                         hrr_cutoff=hrr_cutoff,
                         min_cluster_size=min_cluster_size,
                         max_cluster_size=max_cluster_size)

        hcca_util.load_data(network_data)

        hcca_util.build_clusters()

        # Add new method to DB
        # hcca_util.clusters holds (gene_id, cluster_name, _) tuples;
        # collect the distinct cluster names
        clusters = list(set([t[1] for t in hcca_util.clusters]))
        if len(clusters) > 0:
            print("Done building clusters, adding clusters to DB")

            # Add new method first
            new_method = CoexpressionClusteringMethod()

            new_method.network_method_id = network_method_id
            new_method.method = method
            new_method.cluster_count = len(clusters)

            db.session.add(new_method)

            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Add cluster and store as dict
            cluster_dict = {}

            for c in clusters:
                cluster_dict[c] = CoexpressionCluster()
                cluster_dict[c].method_id = new_method.id
                cluster_dict[c].name = c

                db.session.add(cluster_dict[c])

            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Link sequences to clusters
            for i, t in enumerate(hcca_util.clusters):
                gene_id, cluster_name, _ = t

                relation = SequenceCoexpressionClusterAssociation()

                relation.probe = sequence_probe[
                    gene_id] if gene_id in sequence_probe.keys() else None
                relation.sequence_id = gene_id
                relation.coexpression_cluster_id = cluster_dict[
                    cluster_name].id if cluster_name in cluster_dict.keys(
                    ) else None

                # Skip relations that could not be resolved to a cluster
                if relation.coexpression_cluster_id is not None:
                    db.session.add(relation)

                if i > 0 and i % 400 == 0:
                    # Add relations in sets of 400
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)

            # Add remaining relations
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        else:
            print("No clusters found! Not adding anything to DB !")

    @staticmethod
    def add_lstrap_coexpression_clusters(cluster_file,
                                         description,
                                         network_id,
                                         prefix='cluster_',
                                         min_size=10):
        """
        Adds MCL clusters, as produced by LSTrAP, to the database

        :param cluster_file: path to file with clusters
        :param description: description to add to database for this set of clusters
        :param network_id: network the clusters are based on
        :param prefix: prefix for individual cluster names (default 'cluster_')
        :param min_size: minimal size of a cluster (default = 10)
        :return: ID of new clustering method
        """
        # get all sequences from the database and create a dictionary
        sequences = Sequence.query.all()

        sequence_dict = {}
        for member in sequences:
            sequence_dict[member.name.upper()] = member

        # add coexpression clustering method to the database
        clustering_method = CoexpressionClusteringMethod()

        clustering_method.network_method_id = network_id
        clustering_method.method = description

        try:
            db.session.add(clustering_method)
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
            # NOTE(review): quit() aborts the whole interpreter; raising
            # would be friendlier to callers.
            quit()

        with open(cluster_file) as f:
            i = 1
            for line in f:
                # One cluster per line, whitespace-separated probe names
                probes = [p for p in line.strip().split()]
                # Derive gene names by stripping the transcript suffix.
                # NOTE(review): str.replace removes EVERY '.1' occurrence,
                # not just a trailing one — verify against the probe naming
                # scheme used in the input files.
                genes = [p.replace('.1', '') for p in probes]
                cluster_id = "%s%04d" % (prefix, i)

                if len(probes) >= min_size:
                    i += 1

                    new_cluster = CoexpressionCluster()
                    new_cluster.method_id = clustering_method.id
                    new_cluster.name = cluster_id

                    db.session.add(new_cluster)

                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
                        continue

                    for p, g in zip(probes, genes):
                        new_association = SequenceCoexpressionClusterAssociation(
                        )
                        new_association.probe = p
                        new_association.sequence_id = None
                        if g.upper() in sequence_dict.keys():
                            new_association.sequence_id = sequence_dict[
                                g.upper()].id
                        new_association.coexpression_cluster_id = new_cluster.id
                        db.session.add(new_association)
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)

        return clustering_method.id
Exemple #25
0
class CoexpressionCluster(db.Model):
    """
    A single co-expression cluster: a named group of sequences/probes
    produced by one CoexpressionClusteringMethod. Provides helpers to
    export the cluster's sub-network and to compute GO / clade enrichment.
    """
    __tablename__ = 'coexpression_clusters'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Clustering method that produced this cluster
    method_id = db.Column(
        db.Integer,
        db.ForeignKey('coexpression_clustering_methods.id',
                      ondelete='CASCADE'))
    # Cluster name (e.g. 'cluster_0001'), indexed for lookup
    name = db.Column(db.String(50), index=True)

    # Other properties
    # sequences defined in Sequence
    # sequence_associations defined in SequenceCoexpressionClusterAssociation
    # go_enrichment defined in ClusterGOEnrichment
    # clade_enrichment defined in ClusterCladeEnrichment

    @staticmethod
    def get_cluster(cluster_id):
        """
        Returns the network for a whole cluster (reporting edges only between members of the cluster !)

        :param cluster_id: internal ID of the cluster
        :return: network for the selected cluster (dict with nodes and edges)
        """
        cluster = CoexpressionCluster.query.get(cluster_id)

        probes = [
            member.probe for member in cluster.sequence_associations.all()
        ]

        # Fetch the network rows for exactly the cluster's probes;
        # eager-load only the sequence name to keep the query light
        network = cluster.method.network_method.probes.\
            options(joinedload('sequence').load_only('name')).\
            filter(ExpressionNetwork.probe.in_(probes)).all()

        nodes = []
        edges = []

        # Track already-emitted edges (both directions) to avoid duplicates
        existing_edges = []

        for node in network:
            nodes.append({
                "id":
                node.probe,
                "name":
                node.probe,
                "gene_id":
                int(node.sequence_id)
                if node.sequence_id is not None else None,
                "gene_name":
                node.sequence.name
                if node.sequence_id is not None else node.probe,
                "depth":
                0
            })

            links = json.loads(node.network)

            for link in links:
                # only add links that are in the cluster !
                if link["probe_name"] in probes and [
                        node.probe, link["probe_name"]
                ] not in existing_edges:
                    edges.append({
                        "source":
                        node.probe,
                        "target":
                        link["probe_name"],
                        "profile_comparison":
                        url_for(
                            'expression_profile.expression_profile_compare_probes',
                            probe_a=node.probe,
                            probe_b=link["probe_name"],
                            species_id=node.method.species.id),
                        "depth":
                        0,
                        "link_score":
                        link["link_score"],
                        "link_pcc":
                        link["link_pcc"]
                        if "link_pcc" in link.keys() else None,
                        "hrr":
                        link["hrr"] if "hrr" in link.keys() else None,
                        "edge_type":
                        cluster.method.network_method.edge_type
                    })
                    # Register the edge in both directions
                    existing_edges.append([node.probe, link["probe_name"]])
                    existing_edges.append([link["probe_name"], node.probe])

        return {"nodes": nodes, "edges": edges}

    def __calculate_enrichment(self):
        """
        Initial implementation to calculate GO enrichment for a single cluster
        """
        gene_count = self.method.network_method.species.sequence_count
        species_id = self.method.network_method.species_id

        sequences = self.sequences.options(load_only("id")).all()

        # Non-predicted GO associations for the cluster members, deduplicated
        # per (sequence, GO term) pair via group_by
        associations = SequenceGOAssociation.query\
            .filter(SequenceGOAssociation.sequence_id.in_([s.id for s in sequences]))\
            .filter(SequenceGOAssociation.predicted == 0)\
            .options(load_only("sequence_id", "go_id"))\
            .group_by(SequenceGOAssociation.sequence_id, SequenceGOAssociation.go_id)

        go_data = {}

        for a in associations:
            if a.go_id not in go_data.keys():
                go_data[a.go_id] = {}
                # Species-wide count of this GO term (background)
                go_data[a.go_id]["total_count"] = json.loads(
                    a.go.species_counts)[str(species_id)]
                go_data[a.go_id]["cluster_count"] = 1
            else:
                go_data[a.go_id]["cluster_count"] += 1

        # Hypergeometric test per GO term (survival function = P(X >= k))
        p_values = []
        for go_id in go_data:
            p_values.append(
                hypergeo_sf(go_data[go_id]['cluster_count'], len(sequences),
                            go_data[go_id]['total_count'], gene_count))

        corrected_p_values = fdr_correction(p_values)

        for i, go_id in enumerate(go_data):
            enrichment = ClusterGOEnrichment()
            enrichment.cluster_id = self.id
            enrichment.go_id = go_id

            enrichment.cluster_count = go_data[go_id]['cluster_count']
            enrichment.cluster_size = len(sequences)
            enrichment.go_count = go_data[go_id]['total_count']
            enrichment.go_size = gene_count

            # log2 fold-change of in-cluster frequency vs. background
            enrichment.enrichment = log2(
                (go_data[go_id]['cluster_count'] / len(sequences)) /
                (go_data[go_id]['total_count'] / gene_count))
            enrichment.p_value = p_values[i]
            enrichment.corrected_p_value = corrected_p_values[i]

            db.session.add(enrichment)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def calculate_enrichment(empty=True):
        """
        Static method to calculate the enrichment for all cluster in the database

        :param empty: empty table cluster_go_enrichment first
        """
        # If required empty the table first
        # NOTE(review): the enrichment loop lives in the try/else, so it only
        # runs when empty=True AND the delete succeeded; calling with
        # empty=False silently does nothing — confirm this is intended.
        if empty:
            try:
                db.session.query(ClusterGOEnrichment).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)
            else:
                clusters = CoexpressionCluster.query.all()

                for i, cluster in enumerate(clusters):
                    # print(i, "\t cluster: ", cluster.method_id, cluster.name)
                    cluster.__calculate_enrichment()

    def __calculate_clade_enrichment(self, background, gf_method_id):
        """
        Calculates the clade enrichment for a co-expression cluster (i.e. if genes which originated in a certain clade
        are overrepresented). A background is required (how many genes there are per clade in the organism) and the
        gene family method those clades are based on.

        Calculations will be immediately committed to the DB.

        :param background: dict with background
        :param gf_method_id: internal ID of gene family method
        """
        species_gene_count = self.method.network_method.species.sequence_count
        species_id = self.method.network_method.species_id

        cluster_clade_count = defaultdict(lambda: 0)

        cluster_gene_count = self.sequences.count()

        # NOTE(review): if this query raises, `sequences` stays unbound and
        # the loop below raises NameError — the except only logs.
        try:
            sequences = self.sequences.\
                join(SequenceFamilyAssociation, Sequence.id == SequenceFamilyAssociation.sequence_id).\
                join(GeneFamily, SequenceFamilyAssociation.gene_family_id == GeneFamily.id).\
                add_columns(Sequence.name,
                            Sequence.species_id,
                            SequenceFamilyAssociation.gene_family_id,
                            GeneFamily.method_id,
                            GeneFamily.clade_id).\
                filter(GeneFamily.method_id == gf_method_id).all()
        except Exception as e:
            print(e, file=sys.stderr)

        for s in sequences:
            cluster_clade_count[s.clade_id] += 1

        enrichment_scores = []

        for clade_id, count in cluster_clade_count.items():
            try:
                background_count = background[species_id][clade_id]
                # Hypergeometric test: clade members in cluster vs. species
                p_value = hypergeo_sf(count, cluster_gene_count,
                                      background_count, species_gene_count)
                enrichment = log2((count / cluster_gene_count) /
                                  (background_count / species_gene_count))

                enrichment_scores.append({
                    'clade_count': background_count,
                    'clade_size': species_gene_count,
                    'cluster_count': count,
                    'cluster_size': cluster_gene_count,
                    'p_value': p_value,
                    'enrichment': enrichment,
                    'clade_id': clade_id,
                    'cluster_id': self.id
                })

            except Exception as e:
                print(e, file=sys.stderr)

        corrected_p_values = fdr_correction(
            [es['p_value'] for es in enrichment_scores])

        commit_required = False
        # Only store significant (p < 0.05, on the raw p-value) and
        # positively enriched clades
        for es, corrected_p_value in zip(enrichment_scores,
                                         corrected_p_values):
            if es['p_value'] < 0.05 and es['enrichment'] > 0:
                commit_required = True
                cluster_clade_enrichment = ClusterCladeEnrichment()
                cluster_clade_enrichment.p_value = es['p_value']
                cluster_clade_enrichment.corrected_p_value = corrected_p_value
                cluster_clade_enrichment.enrichment = es['enrichment']
                cluster_clade_enrichment.clade_id = es['clade_id']
                cluster_clade_enrichment.cluster_id = es['cluster_id']
                cluster_clade_enrichment.gene_family_method_id = gf_method_id
                cluster_clade_enrichment.clade_count = es['clade_count']
                cluster_clade_enrichment.clade_size = es['clade_size']
                cluster_clade_enrichment.cluster_count = es['cluster_count']
                cluster_clade_enrichment.cluster_size = es['cluster_size']

                db.session.add(cluster_clade_enrichment)

        if commit_required:
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

    @staticmethod
    def calculate_clade_enrichment(gene_family_method_id, empty=True):
        """
        Calculates clade enrichment for co-expression clusters

        :param gene_family_method_id: gene family method to use to determine clades
        :param empty: when true, removes clade enrichments for the current gf_method
        """
        if empty:
            try:
                print("Removing Existing Enrichment")
                db.session.query(ClusterCladeEnrichment).\
                    filter(ClusterCladeEnrichment.gene_family_method_id == gene_family_method_id).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        # NOTE(review): sep='' has no effect with a single argument;
        # end='' was presumably intended.
        print("Calculating background...", sep='')
        gf_method = GeneFamilyMethod.query.get(gene_family_method_id)
        counts = gf_method.get_clade_distribution()
        print(' Done!')

        # calculate enrichment
        print("Calculate enrichment", sep='')

        clusters = CoexpressionCluster.query.all()

        for i, cluster in enumerate(clusters):
            print(i, "\t cluster: ", cluster.method_id, cluster.name)
            cluster.__calculate_clade_enrichment(counts, gene_family_method_id)

        print(" Done!")

    @staticmethod
    def delete_enrichment():
        """
        Removes all GO enrichment data from the database

        :return:
        """
        try:
            db.session.query(ClusterGOEnrichment).delete()
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    @benchmark
    def calculate_similarities(gene_family_method_id=1, percentile_pass=0.95):
        """
        This function will calculate ALL similarities between clusters in the database. Results will be added to the
        DB

        :param gene_family_method_id: Internal ID of gene family method to use to calculate the scores (default = 1)
        :param percentile_pass: percentile based cutoff (default = 0.95)
        """

        # sqlalchemy to fetch cluster associations
        fields = [
            SequenceCoexpressionClusterAssociation.__table__.c.sequence_id,
            SequenceCoexpressionClusterAssociation.__table__.c.
            coexpression_cluster_id
        ]
        # NOTE(review): `is not None` on a Column is a Python identity test
        # and always evaluates to True, so this WHERE clause is a no-op;
        # Column.isnot(None) (SQL "IS NOT NULL") was almost certainly
        # intended. NULL sequence_ids are filtered later via
        # sequence_to_family membership, so results are unaffected.
        condition = SequenceCoexpressionClusterAssociation.__table__.c.sequence_id is not None
        cluster_associations = db.engine.execute(
            db.select(fields).where(condition)).fetchall()

        # sqlalchemy to fetch sequence family associations
        fields = [
            SequenceFamilyAssociation.__table__.c.sequence_id,
            SequenceFamilyAssociation.__table__.c.gene_family_id,
            GeneFamily.__table__.c.method_id
        ]
        condition = GeneFamily.__table__.c.method_id == gene_family_method_id
        table = join(
            SequenceFamilyAssociation.__table__, GeneFamily.__table__,
            SequenceFamilyAssociation.__table__.c.gene_family_id ==
            GeneFamily.__table__.c.id)
        sequence_families = db.engine.execute(
            db.select(fields).select_from(table).where(condition)).fetchall()

        # convert sqlachemy results into dictionary
        sequence_to_family = {
            seq_id: fam_id
            for seq_id, fam_id, method_id in sequence_families
        }

        cluster_to_sequences = {}
        cluster_to_families = {}

        for seq_id, cluster_id in cluster_associations:
            if cluster_id not in cluster_to_sequences.keys():
                cluster_to_sequences[cluster_id] = []
            cluster_to_sequences[cluster_id].append(seq_id)

        for cluster_id, sequences in cluster_to_sequences.items():
            # Distinct gene families represented in the cluster
            families = list(
                set([
                    sequence_to_family[s] for s in sequences
                    if s in sequence_to_family.keys()
                ]))
            if len(families) > 0:
                cluster_to_families[cluster_id] = families

        keys = list(cluster_to_families.keys())

        data = []

        # Pairwise Jaccard similarity over family sets; O(n^2) in the number
        # of clusters. Clusters with <= 4 families are skipped.
        for i in range(len(keys) - 1):
            for j in range(i + 1, len(keys)):
                current_keys = [keys[x] for x in [i, j]]
                current_families = [
                    cluster_to_families[k] for k in current_keys
                ]

                if len(current_families[0]) > 4 and len(
                        current_families[1]) > 4:
                    # NOTE(review): this rebinds the loop variable j to the
                    # jaccard score; harmless here only because j is not read
                    # again before the next iteration reassigns it.
                    j = jaccard(current_families[0], current_families[1])
                    data.append([current_keys[0], current_keys[1], j])

        ordered_j = sorted([a[2] for a in data])
        if len(ordered_j) > 0:
            # Keep only pairs at or above the requested percentile
            percentile_cutoff = ordered_j[int(
                len(ordered_j) * percentile_pass)]

            database = [{
                'source_id': d[0],
                'target_id': d[1],
                'gene_family_method_id': gene_family_method_id,
                'jaccard_index': d[2],
                'p_value': 0,
                'corrected_p_value': 0
            } for d in data if d[2] >= percentile_cutoff]

            db.engine.execute(CoexpressionClusterSimilarity.__table__.insert(),
                              database)
        else:
            print("No similar clusters found!")

    @property
    def profiles(self):
        """
        Returns a list with all expression profiles of cluster members
        :return: list of all profiles
        """

        sequence_subquery = self.sequences.subquery()

        # undefer('profile') forces loading of the (normally deferred)
        # profile payload in the same query
        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            join(sequence_subquery, ExpressionProfile.sequence_id == sequence_subquery.c.id).all()

        return profiles

    @property
    def interpro_stats(self):
        """
        Get InterPro statistics for the current cluster

        :return: Interpro statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return Interpro.sequence_stats(sequence_ids)

    @property
    def go_stats(self):
        """
        Get GO statistics for the current cluster

        :return: GO statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return GO.sequence_stats(sequence_ids)

    @property
    def family_stats(self):
        """
        Get gene family statistics for the current cluster

        :return: gene family statistics
        """
        sequence_ids = [s.id for s in self.sequences.all()]

        return GeneFamily.sequence_stats(sequence_ids)
Exemple #26
0
class User(db.Model):
    """
    Site user account, compatible with Flask-Login (implements
    is_authenticated / is_active / is_anonymous / get_id).

    Passwords are stored only as werkzeug hashes; admin/banned/newsletter
    flags are stored as small integers.
    """
    __tablename__ = 'users'
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(50), unique=True, index=True)
    first_name = db.Column(db.String(50))
    last_name = db.Column(db.String(50))
    password_hash = db.Column(db.Text)
    email = db.Column(db.Text)
    reset_key = db.Column(db.Text)
    is_admin = db.Column(db.SmallInteger)
    is_banned = db.Column(db.SmallInteger)
    wants_newsletter = db.Column(db.SmallInteger)
    registered = db.Column(db.DateTime)

    def __init__(self,
                 username,
                 password,
                 email,
                 reset_key='',
                 is_admin=False,
                 is_banned=False,
                 registered=None):
        """
        Create a new user; the plaintext password is hashed immediately.

        :param username: unique login name
        :param password: plaintext password (stored hashed, never raw)
        :param email: contact e-mail address
        :param reset_key: optional password-reset token
        :param is_admin: grant administrator rights
        :param is_banned: mark the account as banned
        :param registered: registration timestamp; defaults to the current
            time (truncated to whole seconds) when None
        """
        # BUG FIX: the original default `registered=datetime.now().replace(
        # microsecond=0)` was evaluated ONCE at class-definition time, so
        # every user created with the default shared the interpreter
        # start-up timestamp. A None sentinel computes it per call.
        if registered is None:
            registered = datetime.now().replace(microsecond=0)
        self.username = username
        self.password_hash = generate_password_hash(password)
        self.email = email
        self.reset_key = reset_key
        self.is_admin = is_admin
        self.is_banned = is_banned
        self.registered = registered
        self.wants_newsletter = False

    def __repr__(self):
        # NOTE: assumes the instance has been flushed (id is not None)
        return '<User %d>' % self.id

    def check_password(self, password):
        """Return True if *password* matches the stored hash."""
        return check_password_hash(self.password_hash, password)

    @property
    def is_administrator(self):
        """Truthy when the account has admin rights."""
        return self.is_admin

    @property
    def is_authenticated(self):
        # Flask-Login: any loaded User counts as authenticated
        return True

    @property
    def is_active(self):
        # Flask-Login: accounts are always active (banning is separate)
        return True

    @property
    def is_anonymous(self):
        # Flask-Login: never anonymous
        return False

    def get_id(self):
        """Return the user's ID as a string (Flask-Login requirement)."""
        return str(self.id)

    @staticmethod
    def get(user_id):
        """Fetch a user by primary key (or None if absent)."""
        return User.query.get(user_id)
Exemple #27
0
class Species(db.Model):
    """
    Species in the database, together with cached counts of its sequences,
    networks and expression profiles and display colors for plots.
    """
    __tablename__ = 'species'
    id = db.Column(db.Integer, primary_key=True)
    code = db.Column(db.String(50, collation=SQL_COLLATION), unique=True)
    name = db.Column(db.String(200, collation=SQL_COLLATION))
    data_type = db.Column(db.Enum('genome', 'transcriptome', name='data_type'))
    color = db.Column(db.String(7), default="#C7C7C7")
    highlight = db.Column(db.String(7), default="#DEDEDE")
    # cached counts, maintained by update_counts()
    sequence_count = db.Column(db.Integer)
    network_count = db.Column(db.Integer)
    profile_count = db.Column(db.Integer)
    description = db.Column(db.Text)

    sequences = db.relationship('Sequence',
                                backref='species',
                                lazy='dynamic',
                                cascade="all, delete-orphan",
                                passive_deletes=True)
    networks = db.relationship('ExpressionNetworkMethod',
                               backref='species',
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)
    profiles = db.relationship('ExpressionProfile',
                               backref='species',
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)
    expression_specificities = db.relationship('ExpressionSpecificityMethod',
                                               backref='species',
                                               lazy='dynamic',
                                               cascade="all, delete-orphan",
                                               passive_deletes=True)
    condition_tissues = db.relationship('ConditionTissue',
                                        backref='species',
                                        lazy='dynamic',
                                        cascade="all, delete-orphan",
                                        passive_deletes=True)

    def __init__(self,
                 code,
                 name,
                 data_type='genome',
                 color="#C7C7C7",
                 highlight="#DEDEDE",
                 description=None):
        """
        :param code: short unique species code
        :param name: full species name
        :param data_type: 'genome' or 'transcriptome'
        :param color: hex color used in plots
        :param highlight: hex color used for highlighted elements
        :param description: optional free-text description
        """
        self.code = code
        self.name = name
        self.data_type = data_type
        self.color = color
        self.highlight = highlight
        self.sequence_count = 0
        self.profile_count = 0
        self.network_count = 0
        self.description = description

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @property
    def has_interpro(self):
        """True when at least one sequence of this species has an InterPro domain."""
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_interpro import SequenceInterproAssociation

        domain = SequenceInterproAssociation.query.join(
            Sequence,
            Sequence.id == SequenceInterproAssociation.sequence_id).filter(
                Sequence.species_id == self.id).first()

        return domain is not None

    @property
    def has_go(self):
        """True when at least one sequence of this species has a GO annotation."""
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_go import SequenceGOAssociation

        go = SequenceGOAssociation.query.join(
            Sequence, Sequence.id == SequenceGOAssociation.sequence_id).filter(
                Sequence.species_id == self.id).first()

        return go is not None

    @staticmethod
    def add(code,
            name,
            data_type='genome',
            color="#C7C7C7",
            highlight="#DEDEDE",
            description=None):
        """
        Add a species to the database unless its code already exists.

        :return: internal id of the (new or pre-existing) species
        """
        new_species = Species(code,
                              name,
                              data_type=data_type,
                              color=color,
                              highlight=highlight,
                              description=description)

        species = Species.query.filter_by(code=code).first()

        # species is not in the DB yet, add it
        if species is None:
            try:
                db.session.add(new_species)
                db.session.commit()
            except Exception:
                # BUGFIX: was db.rollback(), which doesn't exist on the
                # Flask-SQLAlchemy object; roll back the session instead.
                db.session.rollback()

            return new_species.id
        else:
            return species.id

    @staticmethod
    def update_counts():
        """
        To avoid long counts the number of sequences, profiles and networks can be precalculated and stored in the
        database using this function.
        """
        species = Species.query.all()

        for s in species:
            s.sequence_count = s.sequences.count()
            s.profile_count = s.profiles.count()
            s.network_count = s.networks.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
Exemple #28
0
class ExpressionProfile(db.Model):
    """
    Expression profile for a single probe/transcript: per-condition expression
    values stored as a JSON blob, linked to a species and (optionally) a sequence.
    """
    __tablename__ = 'expression_profiles'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'),
                            index=True)
    # deferred: profile blobs are large, load them only when explicitly needed
    profile = db.deferred(db.Column(db.Text))

    specificities = db.relationship('ExpressionSpecificity',
                                    backref=db.backref('profile',
                                                       lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    def __init__(self, probe, sequence_id, profile):
        """
        :param probe: probe/transcript name
        :param sequence_id: internal id of the matching sequence (or None)
        :param profile: JSON string with 'order', 'colors' and 'data' keys
        """
        self.probe = probe
        self.sequence_id = sequence_id
        self.profile = profile

    @staticmethod
    def __profile_to_table(data):
        """
        Internal function to convert an expression profile (dict) to a tabular text

        :param data: Dict with expression profile
        :return: table (string)
        """
        output = [["condition", "mean", "min", "max"]]
        order = data["order"]

        for o in order:
            try:
                values = data["data"][o]
                output.append(
                    [o,
                     str(mean(values)),
                     str(min(values)),
                     str(max(values))])
            except Exception as e:
                # skip conditions missing from the data, keep the rest
                print(e)

        return '\n'.join(['\t'.join(l) for l in output])

    @property
    def table(self):
        """
        Returns the condition expression as a tabular text file

        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(json.loads(self.profile))

        return table

    def tissue_table(self, condition_tissue_id, use_means=True):
        """
        Returns the tissue expression as a tabular text file

        :param condition_tissue_id: condition_tissue_id for the conversion
        :param use_means: Use the mean of the condition (recommended)
        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(
            self.tissue_profile(condition_tissue_id, use_means=use_means))
        return table

    @property
    def low_abundance(self, cutoff=10):
        """
        Checks if the mean expression value in any conditions in the plot is higher than the desired cutoff

        NOTE: as a property this can only be accessed without arguments, so the
        cutoff is effectively fixed at its default value.

        :param cutoff: cutoff for expression, default = 10
        :return: True in case of low abundance otherwise False
        """
        data = json.loads(self.profile)

        checks = [mean(v) > cutoff for _, v in data["data"].items()]

        return not any(checks)

    @staticmethod
    def convert_profile(condition_to_tissue, profile_data, use_means=True):
        """
        Convert a full, detailed profile into a more general summarized one using conversion table stored in the
        database

        :param condition_to_tissue: dict with conversion instructions
        :param profile_data: profile to convert
        :param use_means: use means of detailed condition if True otherwise use samples independently. Default True
        :return: New profile
        """
        tissues = list(set(condition_to_tissue['conversion'].values()))

        output = {}

        for t in tissues:
            valid_conditions = [
                k for k in profile_data['data']
                if k in condition_to_tissue['conversion']
                and condition_to_tissue['conversion'][k] == t
            ]
            valid_values = []
            for k, v in profile_data['data'].items():
                if k in valid_conditions:
                    if use_means:
                        valid_values.append(mean(v))
                    else:
                        valid_values += v

            # tissues without any matching condition get a single zero value
            output[t] = valid_values if len(valid_values) > 0 else [0]

        return {
            'order': condition_to_tissue['order'],
            'colors': condition_to_tissue['colors'],
            'data': output
        }

    def tissue_profile(self, condition_tissue_id, use_means=True):
        """
        Applies a conversion to the profile, grouping several condition into one more general feature (e.g. tissue).

        :param condition_tissue_id: identifier of the conversion table
        :param use_means: store the mean of the condition rather than individual values. The matches the spm
        calculations better.
        :return: parsed profile
        """
        ct = ConditionTissue.query.get(condition_tissue_id)

        condition_to_tissue = json.loads(ct.data)
        profile_data = json.loads(self.profile)

        output = ExpressionProfile.convert_profile(condition_to_tissue,
                                                   profile_data,
                                                   use_means=use_means)

        return output

    @staticmethod
    def get_heatmap(species_id, probes, zlog=True, raw=False):
        """
        Returns a heatmap for a given species (species_id) and a list of probes. It returns a dict with 'order'
        the order of the experiments and 'heatmap' another dict with the actual data. Data is zlog transformed

        :param species_id: species id (internal database id)
        :param probes: a list of probes to include in the heatmap
        :param zlog: enable zlog transformation (otherwise normalization against highest expressed condition)
        :param raw: skip normalization entirely (only relevant when zlog is False)
        """
        profiles = ExpressionProfile.query.options(undefer('profile')).filter_by(species_id=species_id).\
            filter(ExpressionProfile.probe.in_(probes)).all()

        order = []

        output = []

        # track requested probes with no matching profile (case-insensitive)
        not_found = [p.lower() for p in probes]

        for profile in profiles:
            name = profile.probe
            data = json.loads(profile.profile)
            order = data['order']
            experiments = data['data']

            with contextlib.suppress(ValueError):
                not_found.remove(profile.probe.lower())

            with contextlib.suppress(ValueError):
                not_found.remove(profile.sequence.name.lower())

            values = {}

            for o in order:
                values[o] = mean(experiments[o])

            row_mean = mean(values.values())
            row_max = max(values.values())

            for o in order:
                if zlog:
                    # log2 fold-change vs the row mean; zeros can't be transformed
                    if row_mean == 0 or values[o] == 0:
                        values[o] = '-'
                    else:
                        try:
                            values[o] = log(values[o] / row_mean, 2)
                        except ValueError as _:
                            print("Unable to calculate log()", values[o],
                                  row_mean)
                            values[o] = '-'
                else:
                    # normalize against the highest expressed condition
                    if row_max != 0 and not raw:
                        values[o] = values[o] / row_max

            output.append({
                "name": name,
                "values": values,
                "sequence_id": profile.sequence_id,
                "shortest_alias": profile.sequence.shortest_alias
            })

        if len(not_found) > 0:
            flash("Couldn't find profile for: %s" % ", ".join(not_found),
                  "warning")

        return {'order': order, 'heatmap_data': output}

    @staticmethod
    def get_profiles(species_id, probes, limit=1000):
        """
        Gets the data for a set of probes (including the full profiles), a limit can be provided to avoid overly
        long queries

        :param species_id: internal id of the species
        :param probes: probe names to fetch
        :param limit: maximum number of probes to get
        :return: List of ExpressionProfile objects including the full profiles
        """
        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            filter(ExpressionProfile.probe.in_(probes)).\
            filter_by(species_id=species_id).\
            options(joinedload('sequence').load_only('name').noload('xrefs')).\
            limit(limit).all()

        return profiles

    @staticmethod
    def add_profile_from_lstrap(matrix_file,
                                annotation_file,
                                species_id,
                                order_color_file=None):
        """
        Function to convert an (normalized) expression matrix (lstrap output) into a profile

        :param matrix_file: path to the expression matrix
        :param annotation_file: path to the file assigning samples to conditions
        :param species_id: internal id of the species
        :param order_color_file: tab delimited file that contains the order and color of conditions
        """
        annotation = {}

        with open(annotation_file, 'r') as fin:
            # get rid of the header
            _ = fin.readline()

            for line in fin:
                parts = line.strip().split('\t')
                if len(parts) > 1:
                    run, description = parts
                    annotation[run] = description

        order, colors = [], []
        if order_color_file is not None:
            with open(order_color_file, 'r') as fin:
                for line in fin:
                    try:
                        o, c = line.strip().split('\t')
                        order.append(o)
                        colors.append(c)
                    except Exception as _:
                        # skip malformed lines
                        pass

        # build conversion table for sequences
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        sequence_dict = {}  # key = sequence name uppercase, value internal id
        for s in sequences:
            sequence_dict[s.name.upper()] = s.id

        with open(matrix_file) as fin:
            # read header
            _, *colnames = fin.readline().rstrip().split()

            colnames = [c.replace('.htseq', '') for c in colnames]

            # determine order when it wasn't defined by an order/color file.
            # BUGFIX: this used to test `order is None`, but order is always a
            # list (initialized to []), so the fallback never ran and profiles
            # ended up with an empty condition order.
            if not order:
                order = []

                for c in colnames:
                    if c in annotation.keys():
                        if annotation[c] not in order:
                            order.append(annotation[c])

                order.sort()

            # read each line and build profile
            new_probes = []
            for line in fin:
                transcript, *values = line.rstrip().split()
                profile = defaultdict(list)

                for c, v in zip(colnames, values):
                    if c in annotation.keys():
                        condition = annotation[c]
                        profile[condition].append(float(v))

                new_probe = {
                    "species_id":
                    species_id,
                    "probe":
                    transcript,
                    "sequence_id":
                    sequence_dict[transcript.upper()]
                    if transcript.upper() in sequence_dict.keys() else None,
                    "profile":
                    json.dumps({
                        "order": order,
                        "colors": colors,
                        "data": profile
                    })
                }

                new_probes.append(new_probe)

                # flush in batches to keep memory usage bounded
                if len(new_probes) > 400:
                    db.engine.execute(ExpressionProfile.__table__.insert(),
                                      new_probes)
                    new_probes = []

            # BUGFIX: guard against an empty list — executing an insert with an
            # empty parameter list adds a single all-defaults row in SQLAlchemy.
            if new_probes:
                db.engine.execute(ExpressionProfile.__table__.insert(),
                                  new_probes)
Exemple #29
0
class TreeMethod(db.Model):
    """
    Method used to construct a set of phylogenetic trees, linked to the gene
    family method the trees were built from.
    """
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)

    description = db.Column(db.Text)

    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Reconcile all trees of this method against the species tree: label each
        internal node with its clade and duplication/speciation status, store
        sequence-sequence-clade associations and updated PhyloXML data.
        """
        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []

        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        # Print warning in case there is a non-binary node.
                        # BUGFIX: used tree.id/tree.label, but `tree` is the
                        # parsed newick node (which has neither attribute);
                        # the database record `t` holds the id and label.
                        print(
                            "[%d, %s] Skipping node... Can only reconcile binary nodes ..."
                            % (t.id, t.label))
                    # Otherwise it is a leaf node and can be skipped
                    continue

                branch_one_seq = [
                    l.name.strip() for l in node.descendants[0].get_leaves()
                ]
                branch_two_seq = [
                    l.name.strip() for l in node.descendants[1].get_leaves()
                ]

                branch_one_species = set([
                    seq_to_species[s] for s in branch_one_seq
                    if s in seq_to_species.keys()
                ])
                branch_two_species = set([
                    seq_to_species[s] for s in branch_two_seq
                    if s in seq_to_species.keys()
                ])

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species,
                                                   branch_two_species,
                                                   clade_to_species)

                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(
                        branch_one_species, branch_two_species)

                # encode clade id, D(uplication)/S(peciation) and consistency
                # score into the node label
                tags = [
                    clade_to_id[clade] if clade is not None else 0,
                    'D' if duplication else 'S',
                    duplication_consistency if duplication else 0
                ]

                node.name = '_'.join([str(t) for t in tags])

                if clade is not None:
                    # store the association in both directions for fast lookup
                    for seq_one in branch_one_seq:
                        for seq_two in branch_two_seq:
                            new_associations.append({
                                'sequence_one_id':
                                seq_to_id[seq_one],
                                'sequence_two_id':
                                seq_to_id[seq_two],
                                'tree_id':
                                t.id,
                                'clade_id':
                                clade_to_id[clade],
                                'duplication':
                                1 if duplication else 0,
                                'duplication_consistency_score':
                                duplication_consistency
                            })
                            new_associations.append({
                                'sequence_one_id':
                                seq_to_id[seq_two],
                                'sequence_two_id':
                                seq_to_id[seq_one],
                                'tree_id':
                                t.id,
                                'clade_id':
                                clade_to_id[clade],
                                'duplication':
                                1 if duplication else 0,
                                'duplication_consistency_score':
                                duplication_consistency
                            })

            # flush in batches to keep memory usage bounded
            if len(new_associations) > 400:
                db.engine.execute(
                    SequenceSequenceCladeAssociation.__table__.insert(),
                    new_associations)
                new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # BUGFIX: guard against an empty list — executing an insert with an
        # empty parameter list adds a single all-defaults row in SQLAlchemy.
        if new_associations:
            db.engine.execute(
                SequenceSequenceCladeAssociation.__table__.insert(),
                new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data.keys():
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
Exemple #30
0
class ExpressionSpecificityMethod(db.Model):
    """
    Method used to compute expression specificity (SPM, tau, entropy) scores
    for the profiles of one species.
    """
    __tablename__ = 'expression_specificity_method'

    id = db.Column(db.Integer, primary_key=True)
    description = db.Column(db.Text)
    # JSON list of the conditions/tissues this method covers
    conditions = db.Column(db.Text)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)

    specificities = db.relationship('ExpressionSpecificity',
                                    backref='method',
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    condition_tissue = db.relationship('ConditionTissue',
                                       backref='expression_specificity_method',
                                       lazy='joined',
                                       cascade="all, delete-orphan",
                                       passive_deletes=True,
                                       uselist=False)

    menu_order = db.Column(db.Integer)

    def __repr__(self):
        return str(
            self.id) + ". " + self.description + ' [' + self.species.name + ']'

    @staticmethod
    def calculate_specificities(species_id,
                                description,
                                remove_background=False):
        """
        Function that calculates condition specificities for each profile. No grouping is applied, each condition is
        used as is

        :param species_id: internal species ID
        :param description: description for the method to determine the specificity
        :param remove_background: when true the lowest value of each profile is substracted from all values (can be
        off use with noisy data derived from microarrays.
        :return: id of the new method
        """

        conditions = []

        # get profile from the database (ORM free for speed)
        profiles = db.engine.execute(
            db.select([
                ExpressionProfile.__table__.c.id,
                ExpressionProfile.__table__.c.profile
            ]).where(ExpressionProfile.__table__.c.species_id ==
                     species_id)).fetchall()

        # detect all conditions
        for profile_id, profile in profiles:
            profile_data = json.loads(profile)
            for condition in profile_data['order']:
                if condition not in conditions:
                    conditions.append(condition)

        # convert list into dictionary (identity mapping) and run function
        conditions_dict = {k: k for k in conditions}
        return ExpressionSpecificityMethod.calculate_tissue_specificities(
            species_id,
            description,
            conditions_dict,
            conditions,
            remove_background=remove_background)

    @staticmethod
    def calculate_tissue_specificities(species_id,
                                       description,
                                       condition_to_tissue,
                                       order,
                                       remove_background=False,
                                       use_max=True):
        """
        Function calculates tissue specific genes based on the expression conditions. A dict is required to link
        specific conditions to the correct tissues. This also allows conditions to be excluded in case they are
        unrelated with a specific tissue.


        :param species_id: internal species ID
        :param description: description for the method to determine the specificity
        :param condition_to_tissue: dict to connect a condition to a tissue
        :param order: preferred order of the conditions, will match tissues to it
        :param remove_background: substracts the lowest value to correct for background noise
        :param use_max: uses the maximum of mean values instead of the mean of all values
        :return id of the new method
        """
        new_method = ExpressionSpecificityMethod()
        new_method.species_id = species_id
        new_method.description = description
        new_method.menu_order = 0

        # derive the tissue list, preserving the preferred condition order
        tissues = []
        for c in order:
            if c in condition_to_tissue.keys():
                v = condition_to_tissue[c]
                if v not in tissues:
                    tissues.append(v)

        # get profile from the database (ORM free for speed)
        profiles = db.engine.execute(
            db.select([
                ExpressionProfile.__table__.c.id,
                ExpressionProfile.__table__.c.profile
            ]).where(ExpressionProfile.__table__.c.species_id ==
                     species_id)).fetchall()

        new_method.conditions = json.dumps(tissues)

        # commit first so new_method.id is available for the specificities
        db.session.add(new_method)
        db.session.commit()

        # detect specifities and add to the database
        specificities = []

        for profile_id, profile in profiles:
            # prepare profile data for calculation
            profile_data = json.loads(profile)
            profile_means = {}
            for t in tissues:
                values = []
                means = []
                valid_conditions = [
                    k for k in profile_data['data']
                    if k in condition_to_tissue and condition_to_tissue[k] == t
                ]
                for k, v in profile_data['data'].items():
                    if k in valid_conditions:
                        values += v
                        means.append(mean(v))

                if not use_max:
                    profile_means[t] = mean(values) if len(values) > 0 else 0
                else:
                    profile_means[t] = max(means) if len(means) > 0 else 0

            # substract minimum value to remove background
            # experimental code !
            if remove_background:
                minimum = min([v for k, v in profile_means.items()])

                for k in profile_means.keys():
                    profile_means[k] -= minimum

            # determine spm score for each condition
            profile_specificities = []
            profile_tau = tau([v for _, v in profile_means.items()])
            profile_entropy = entropy_from_values(
                [v for _, v in profile_means.items()])

            for t in tissues:
                score = expression_specificity(t, profile_means)
                new_specificity = {
                    'profile_id': profile_id,
                    'condition': t,
                    'score': score,
                    'entropy': profile_entropy,
                    'tau': profile_tau,
                    'method_id': new_method.id,
                }

                profile_specificities.append(new_specificity)

            # sort conditions and add top one
            profile_specificities = sorted(profile_specificities,
                                           key=lambda x: x['score'],
                                           reverse=True)

            specificities.append(profile_specificities[0])

            # write specificities to db if there are more than 400 (ORM free for speed)
            if len(specificities) > 400:
                db.engine.execute(ExpressionSpecificity.__table__.insert(),
                                  specificities)
                specificities = []

        # write remaining specificities to the db.
        # BUGFIX: guard against an empty list — executing an insert with an
        # empty parameter list adds a single all-defaults row in SQLAlchemy.
        if specificities:
            db.engine.execute(ExpressionSpecificity.__table__.insert(),
                              specificities)
        return new_method.id