# ---- Example #1 ----
 def generate_dataframe(self, source_data, network_config ):
     """Build a gene-indexed expression DataFrame for networked genes.

     Pulls file locations out of *source_data* and the table/source ids out
     of *network_config*, strips S3 prefixes from the paths, then runs the
     probe -> gene mapping pipeline. Probes that map to the same network
     gene are collapsed to their per-column median.

     Returns a (DataFrame, estimated_network_map_size) tuple.
     """
     data_file = source_data['data_file']
     annotations_file = source_data['annotations_file']
     agilent_file = source_data['agilent_file']
     synonym_file = source_data['synonym_file']
     network_table = network_config['network_table']
     source_id = network_config['network_source']
     # local paths only from here on
     data_file, annotations_file, agilent_file, synonym_file = self.strip_s3_path(
         (data_file, annotations_file, agilent_file, synonym_file))
     self.logger.debug("Getting base data table")
     raw_data = self._get_data(data_file, annotations_file)
     self.logger.debug("Mapping probes to genes")
     probe_map = self._get_probe_mapping(agilent_file)
     self.logger.debug("Getting genes in given networks")
     network_genes = self._get_network_genes(network_table, source_id)
     self.logger.debug("Adding synonyms")
     syn_map = self._get_synonyms(probe_map, network_genes, synonym_file)
     self.logger.debug("Mapping probes to genes in networks")
     gene_to_probes = self._get_probe_to_gene_map(probe_map, syn_map)
     self.logger.info("Creating DataFrame")
     frame = DataFrame(np.zeros((len(gene_to_probes), len(raw_data.columns))),
         index=gene_to_probes.keys(), columns=raw_data.columns)
     # collapse all probes for one network gene to their median value
     self.logger.debug("Aggregating multiple probes to single genes"
             " as median values" )
     for gene, probe_list in gene_to_probes.iteritems():
         frame.ix[gene] = raw_data.ix[probe_list].median()
     map_size = self._estimate_net_map(gene_to_probes)
     return frame, map_size
# ---- Example #2 ----
 def generate_dataframe(self, data_file, annotations_file, 
         agilent_file, synonym_file, network_table, source_id ):
     """Build a gene-indexed expression DataFrame for networked genes.

     Runs the probe -> gene mapping pipeline over the given local files
     and the network table identified by *network_table* / *source_id*.
     Probes mapping to the same network gene are collapsed to their
     per-column median.

     Returns the resulting DataFrame (network genes x sample columns).
     """
     data_orig = self._get_data( data_file, annotations_file )
     probe_map = self._get_probe_mapping( agilent_file )
     network_genes = self._get_network_genes( network_table, source_id )
     syn_map = self._get_synonyms( probe_map, network_genes, synonym_file )
     ng2pm = self._get_probe_to_gene_map( probe_map, syn_map )
     new_df = DataFrame(np.zeros( (len(ng2pm), len(data_orig.columns))),
         index=ng2pm.keys(), columns=data_orig.columns )
     # map each network gene to the median of synonymous probes
     # (removed dead local `test = True` — it was never read)
     for k, probes in ng2pm.iteritems():
         new_df.ix[k] = data_orig.ix[probes].median()
     return new_df
# ---- Example #3 ----
def mapNewData(working_bucket, data, meta_data, anno_data,syn_file,agilent_file,network_table):
    """
    Given local file locations for source data, meta data, annotations data,
        synonyms file and the agilent (probe->gene) file,
    Creates a new dataframe, containing only gene information for genes
        present in the network table, indexed by gene name, columns are sample ids.
    Also pickles the dataframe locally and uploads it (plus the metadata file)
        to the given S3 working bucket.
    Returns (dataframe pickle file location, dataframe).
    """
    anno = pandas.io.parsers.read_table(anno_data)
    data = pandas.io.parsers.read_table(data)
    # NOTE(review): metadata is parsed but never used below; kept so a bad
    # meta_data path still fails here rather than at upload time.
    metadata = pandas.io.parsers.read_table(meta_data)
    agl = pandas.io.parsers.read_table(agilent_file)

    # get rid of control probes
    data.index = anno['ProbeName']
    control_probe_names = anno['ProbeName'][anno['ControlType'] != 0]
    data = data.drop(control_probe_names)

    # keep only probes that carry a gene symbol, indexed by probe id
    # (removed dead `agl.set_index('ProbeID')` — set_index is not in-place,
    # so its return value was being discarded)
    agl2 = agl[agl['GeneSymbol'].notnull()]
    agl2 = agl2.set_index('ProbeID')

    # gene symbols present on the agilent array
    agilent_genes = set(agl2['GeneSymbol'].tolist())

    # collect every gene id appearing in any network record
    table = Table(network_table)
    network_genes = []
    for net in table.scan():
        # gene_ids payload looks like "<6-char prefix>g1~:~g2~:~..."
        network_genes += net['gene_ids'][6:].split('~:~')
    network_genes_set = set(network_genes)

    # mm: array gene symbol (column 3 of the synonym file) -> list of
    # network gene ids it is synonymous with
    mm = {}
    with open(syn_file,'r') as synonyms:
        for line in synonyms:
            parsed = line.split()
            try:
                temp = []
                for p in parsed[:5]:
                    for t in p.split('|'):
                        if len(t) > 2 and t in network_genes_set and parsed[2] in agilent_genes:
                            temp.append(t)
                if len(temp) > 0:
                    if parsed[2] not in mm:
                        mm[parsed[2]] = []
                    for t in temp:
                        if t not in mm[parsed[2]]:
                            mm[parsed[2]].append(t)
            except IndexError:
                # short/malformed synonym line — skip it
                pass

    # ng2p: network gene id -> list of probe ids measuring it
    ng2p = {}
    with open(agilent_file, 'r') as gl:
        for line in gl:
            parsed = line.split()
            try:
                if parsed[2] in mm: #mouse gene is mapped to network gene
                    for ng in mm[parsed[2]]:
                        if ng not in ng2p:
                            ng2p[ng] = []
                        if parsed[0] not in ng2p[ng]:
                            ng2p[ng].append(parsed[0])
            except IndexError:
                # short/malformed agilent line — skip it
                pass

    # create newly trimmed and annotated data frame; genes with multiple
    # probes get the per-column median. Save pickle locally.
    df = DataFrame(np.zeros((len(ng2p), len(data.columns))), index=ng2p.keys(), columns=data.columns)
    for k,v in ng2p.iteritems():
        df.ix[k] = data.ix[v].median()
    saved = os.path.join(os.path.split(agilent_file)[0],'trimmed_dataframe.pandas')
    df.save(saved)

    # send pickled dataframe (and the raw metadata file) to the working bucket
    conn = boto.connect_s3()
    bucket = conn.get_bucket(working_bucket)
    k = Key(bucket)
    k.key = 'trimmed_dataframe.pandas'
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(saved)

    k.key = 'metadata.txt'
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(meta_data)

    return saved,df