Beispiel #1
0
def exclude_genes_in_network(genes_to_keep,
                             network_fullpath,
                             output_fullpath,
                             header=True):
    '''
    Filter an interaction network down to interactions between kept genes.

    A row of the network is copied to the output only when the genes in
    BOTH its first and second column are members of genes_to_keep.

    Inputs:
        genes_to_keep: collection of gene names to retain (typically the
            list produced by extract_gene_names_from_textfile()).
        network_fullpath: textfile of the interaction network. The first
            two columns of each row hold the two interacting genes.
        output_fullpath: path of the textfile that receives the filtered
            network.
        header: True if network_fullpath starts with a header row, False
            otherwise. Default True. When True the header is copied to the
            output as well.
    Outputs:
        None. The filtered network is written to output_fullpath and a
        summary of rows read/written is printed.
    Raises:
        SystemExit: if header is neither True nor False.
    '''
    # Validate before any file is opened so that a bad argument cannot
    # leave a half-initialised output file behind.
    if header not in (True, False):
        sys.exit('Header must be either True or False, %s given' % (header,))

    # Build the lookup once: set membership is O(1), whereas the original
    # list would be scanned twice for every row of the network.
    genes_to_keep = set(genes_to_keep)

    network = read_write_data.read_write(network_fullpath,
                                         output_fullpath,
                                         header=header)
    with network:
        # Write column names if header is true
        if header:
            network.writenext(network.inputcolnames)
        while True:
            try:
                readrow = network.readnext()
            except StopIteration:
                # readnext() signals end of input with StopIteration.
                print('%s rows read, %s written, breaking loop...'
                      % (network.readrowcount, network.writerowcount))
                break
            # If both genes in first and second column are in genes to keep,
            # then write row to file.
            if readrow[0] in genes_to_keep and readrow[1] in genes_to_keep:
                network.writenext(readrow)
    return None
def exclude_genes_in_network(genes_to_keep, network_fullpath, 
                             output_fullpath, header=True):
    '''
    Keep only the interactions whose two genes are both in genes_to_keep.

    Reads the interaction network row by row and writes the surviving rows
    to a new file.

    Inputs:
        genes_to_keep: collection of gene names to retain (typically the
            list produced by extract_gene_names_from_textfile()).
        network_fullpath: textfile of the interaction network. The first
            two columns of each row hold the two interacting genes.
        output_fullpath: path of the textfile that receives the filtered
            network.
        header: True if network_fullpath starts with a header row, False
            otherwise. Default True. When True the header is copied to the
            output as well.
    Outputs:
        None. The filtered network is written to output_fullpath and a
        summary of rows read/written is printed.
    Raises:
        SystemExit: if header is neither True nor False.
    '''
    # Reject a bad header flag up front, before any file is opened.
    header_is_valid = header == True or header == False
    if not header_is_valid:
        sys.exit('Header must be either True or False, %s given' % (header,))

    # One-off conversion to a set so each membership test below is O(1)
    # instead of a scan over the whole gene list.
    wanted = set(genes_to_keep)

    network = read_write_data.read_write(network_fullpath, 
                                         output_fullpath, 
                                         header=header)
    with network:
        # Write column names if header is true
        if header:
            network.writenext(network.inputcolnames)
        while True:
            try:
                readrow = network.readnext()
            except StopIteration:
                # End of input: report the counters and stop.
                print('%s rows read, %s written, breaking loop...'
                      % (network.readrowcount, network.writerowcount))
                break
            first_gene, second_gene = readrow[0], readrow[1]
            # A row survives only if BOTH interacting genes are wanted.
            if first_gene in wanted and second_gene in wanted:
                network.writenext(readrow)
    return None
def get_influence(influence_fullpath, row_genes, col_genes, output_fullpath):
    """
    Extract a sub-matrix of an influence matrix and write it to file.

    The requested row and column genes are first matched against the
    column names of the influence matrix. Genes that cannot be matched are
    dropped. The remaining genes are reordered to follow the order of the
    matrix, so the file can be streamed once from top to bottom.

    Row genes are located through the column names, so this assumes the
    rows of the matrix appear in the same order as its columns
    (NOTE(review): confirm the matrix is square and symmetrically labelled).

    Inputs:
        influence_fullpath: textfile holding the influence matrix, with a
            header row of gene names and the row gene name as first
            element of every following row.
        row_genes: gene names whose rows should be extracted.
        col_genes: gene names whose columns should be extracted.
        output_fullpath: path of the textfile that receives the result.
    Outputs:
        None. The sub-matrix is written to output_fullpath and a summary
        of rows read/written is printed.
    Raises:
        SystemExit: with a non-zero status if a mapped row gene is not
            found before the matrix is exhausted.
    """
    influence = read_write_data.read_write(influence_fullpath, output_fullpath, header=True)
    with influence:
        # Insert dummy variable X to beginning of column headers so that
        # the length of column names and subsequent rows are equal.
        influence.inputcolnames.insert(0, "X")

        # Position of the FIRST occurrence of every column name; this
        # mirrors list.index() but costs one pass instead of one scan
        # per requested gene.
        position_of = {}
        for position, name in enumerate(influence.inputcolnames):
            position_of.setdefault(name, position)

        # Keep only genes present in the matrix and sort them by their
        # position so they are met in file order while streaming.
        col_pairs = sorted((position_of[gene], gene) for gene in col_genes if gene in position_of)
        row_pairs = sorted((position_of[gene], gene) for gene in row_genes if gene in position_of)
        col_indices = [position for (position, _) in col_pairs]
        ordered_col_genes = [gene for (_, gene) in col_pairs]
        ordered_row_genes = [gene for (_, gene) in row_pairs]

        # Write columns to file.
        influence.writenext(["X"] + ordered_col_genes)

        # Stream the matrix. Rows are read lazily inside the loop, so an
        # empty matrix is reported through the handler below rather than
        # escaping as a bare StopIteration.
        current_row = None
        current_gene = None
        for rgene in ordered_row_genes:
            while current_row is None or current_gene != rgene:
                try:
                    current_row = influence.readnext()
                except StopIteration:
                    print("%s was not found in interaction network." % rgene)
                    print(
                        "StopIteration at %s rows read, %s written." % (influence.readrowcount, influence.writerowcount)
                    )
                    # Non-zero status: the extraction did not complete.
                    sys.exit(1)
                current_gene = current_row[0]  # First element contains gene name.
            influence_on_cgenes = [current_row[i] for i in col_indices]
            influence.writenext([current_gene] + influence_on_cgenes)
        print("%s rows read, %s written." % (influence.readrowcount, influence.writerowcount))
def get_influence(influence_fullpath, row_genes, col_genes, output_fullpath):
    '''
    Reads influence matrix and extracts the rows and columns specified by
    row_genes and col_genes respectively. Output is then written to file.

    Genes that do not occur among the matrix column names are dropped.
    The remaining genes are put in matrix order so that the matrix can be
    read once, top to bottom, without re-checking every row gene on every
    row.

    Row genes are matched against the column names, so this assumes the
    rows of the matrix are labelled in the same order as its columns
    (NOTE(review): confirm against the files this is run on).

    Inputs:
        influence_fullpath: textfile holding the influence matrix, with a
            header row of gene names and the row gene name as first
            element of every following row.
        row_genes: gene names whose rows should be extracted.
        col_genes: gene names whose columns should be extracted.
        output_fullpath: path of the textfile that receives the result.
    Outputs:
        None. The sub-matrix is written to output_fullpath and a summary
        of rows read/written is printed.
    Raises:
        SystemExit: with a non-zero status if a mapped row gene is not
            found before the matrix is exhausted.
    '''
    influence = read_write_data.read_write(influence_fullpath,
                                           output_fullpath,
                                           header=True)
    with influence:
        # Insert dummy variable X to beginning of column headers so that
        # the length of column names and subsequent rows are equal.
        influence.inputcolnames.insert(0, 'X')

        # Lookup table name -> index of its first occurrence (the same
        # answer list.index() gives, without rescanning per gene).
        first_index = {}
        for idx, colname in enumerate(influence.inputcolnames):
            if colname not in first_index:
                first_index[colname] = idx

        def in_matrix_order(genes):
            # (index, gene) pairs for the genes found in the matrix,
            # sorted by where they sit in the matrix.
            return sorted(
                (first_index[gene], gene)
                for gene in genes
                if gene in first_index
            )

        col_pairs = in_matrix_order(col_genes)
        col_indices = [idx for (idx, _) in col_pairs]
        kept_col_genes = [gene for (_, gene) in col_pairs]
        kept_row_genes = [gene for (_, gene) in in_matrix_order(row_genes)]

        # Write columns to file.
        influence.writenext(['X'] + kept_col_genes)

        # Iterate rows in interaction network and extract row genes and
        # column genes. No row is read before the loop, so an empty matrix
        # ends up in the StopIteration handler instead of crashing.
        current_row = None
        current_gene = None
        for rgene in kept_row_genes:
            while current_row is None or current_gene != rgene:
                try:
                    current_row = influence.readnext()
                except StopIteration:
                    print('%s was not found in interaction network.' % rgene)
                    print('StopIteration at %s rows read, %s written.'
                          % (influence.readrowcount, influence.writerowcount))
                    # Signal failure to the caller/shell with a non-zero
                    # exit status.
                    sys.exit(1)
                current_gene = current_row[0]  # First element contains gene name.
            influence_on_cgenes = [current_row[i] for i in col_indices]
            influence.writenext([current_gene] + influence_on_cgenes)
        print('%s rows read, %s written.' %
              (influence.readrowcount, influence.writerowcount))