def export_sample_info(self):
        print('get sample_raw and raw_sample')
        if os.path.isfile(self.par['file_sample_info']):
            self.file_to_samples()
        else:
            self.raw_to_samples()
            #generate sample file
            self.sample_info()
        #
        print("\nSample and raw files:")
        myDict.basic(self.sample_raw).print_dict()
        self.par['raw_to_sample'] = self.raw_sample # one raw file vs one sample name
        self.par['sample_to_raw'] = self.sample_raw #one sample name vs a list of raw files
        #get sample names
        self.sample_names = sorted(self.sample_raw.keys())
        self.par['sample_names'] = self.sample_names
        #get sample_dirs
        self.sample_storage()
        self.par['sample_dirs'] = self.sample_dirs

        print('get group names, if any exist')
        flag = 1
        while flag > 0:
            group_samples, sample_groups = self.group_names(flag + 2)
            if group_samples == {}:
                flag = 0
            else:
                key = 'group' + str(flag)
                self.par[key] = group_samples
                flag += 1
                print('Groups of {}: {}'.format(key,self.par[key].keys()))
        return self.par
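
The sample_raw/raw_sample pair kept above is a one-to-many map and its one-to-one inverse. A minimal standalone sketch of that inversion (the names here are illustrative, not part of the tool's API):

import collections

def invert_to_lists(raw_sample):
    #raw_sample: one raw file -> one sample name
    sample_raw = collections.defaultdict(list)
    for raw_file, sample_name in raw_sample.items():
        #one sample name -> list of raw files, e.g. one file per lane
        sample_raw[sample_name].append(raw_file)
    return dict(sample_raw)

print(invert_to_lists({'a_L001.fq': 's1', 'a_L002.fq': 's1'}))
#{'s1': ['a_L001.fq', 'a_L002.fq']}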
Example #2
 def protein_peptides(self):
     pro_pep = {}
     #read annotation file
     annot_dict = myIO.file_os(self.par['file_annotation'], "\t").to_dict2()
     if 'Rnl2_SPIKEIN' in annot_dict: 
         annot_dict['Rnl2_SPIKEIN']['pep_rank'] = 0
     in_pro = [annot_dict[p]['pro_id'] for p in annot_dict.keys()]
     in_pro = list(set(in_pro))
     print('In proteins:{}, In peptides:{}'.format(len(in_pro), len(annot_dict)))
         
     ##
     pro_rank_pep = {}
     for pep_id in self.par['pep_ids']:
         pro_id = annot_dict[pep_id]['pro_id']
         pep_rank = annot_dict[pep_id]['pep_rank']
         #ranks come from a text file as strings; non-numeric ranks default to 0
         pep_rank = int(pep_rank) if str(pep_rank).isdigit() else 0
         if pro_id in pro_rank_pep:
             pro_rank_pep[pro_id][pep_id] = pep_rank
         else:
             pro_rank_pep[pro_id] = {pep_id:pep_rank}
             #print pro_rank_pep[pro_id]
     #
     pep_num = 0
     for pro_id, pep_dict in pro_rank_pep.items():
         #print sorted(pep_dict.keys())
         peps = sorted(pep_dict, key=pep_dict.get)  #peptide ids ordered by pep_rank
         pep_num += len(peps)
         pro_pep[pro_id] = ','.join(peps)
     #export
     print("Number of protein:{}\tNumber of peptides:{}.".format(len(pro_pep.keys()), pep_num))
     myDict.basic(pro_pep, self.par['pro_ids']).dict_to_file(self.par['file_pro_pep'], "\t")
     #
     return pro_pep
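
The ordering step above leans on sorted(d, key=d.get), which returns a dict's keys sorted by their values. A minimal sketch with made-up ranks:

ranks = {'pep_3': 2, 'pep_1': 0, 'pep_2': 1}
ordered = sorted(ranks, key=ranks.get)  #keys ordered by rank value
print(','.join(ordered))  #pep_1,pep_2,pep_3 -- the comma-joined form stored in pro_pep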
Example #3
    def sig_polyclonal(self, count_file):
        #count_file = args_tuple
        print("Polyclonal analysis of ", count_file)
        comb_df, pep_df = myCommon.basic(self.par).combine_df(count_file)

        #functions
        def hits_func(x, peps, threshold, pro_id):
            #significant hits
            hits = x[x >= threshold]
            #non_overlapping peptides
            peps = [str(x) for x in peps]
            hit_peps = [str(x) for x in hits.index]
            none_overlapped_hits_num = myList.basic(peps).un_neighbours(
                hit_peps, return_type='hits_num')
            #if none_overlapped_hits_num>1: print "%d,%d" %(len(list(hits.index)), none_overlapped_hits_num)
            #if len(hit_peps)>0: print pro_id, peps, hit_peps
            #if pro_id == 'Q9YLJ1': print pro_id, peps, hit_peps
            return len(list(
                hits.index)), none_overlapped_hits_num, ','.join(hit_peps)

        #collapse by protein
        hits1 = {}
        hits2 = {}
        #n = 1
        for pro_id, row_index in comb_df.groupby('pro_id').groups.items():
            #row is protein id
            ##get protein-peptides annotations
            peps_str = self.par['dict_pro_pep'][pro_id]
            peps = peps_str.split(',')
            #df by protein
            sub_df = pep_df.loc[row_index]  #.ix was removed from pandas
            #print("{}\t{}".format(pro_id, list(sub_df.index)) )
            #hits num beyond zscore threshold
            hits_num = sub_df.apply(hits_func,
                                    axis=0,
                                    args=(peps, self.par['zscore_threshold'],
                                          pro_id))
            #if pro_id == 'Q9YLJ1': print hits_num
            #all number of significant hits
            num1 = [h[0] for h in hits_num]
            hits1[pro_id] = dict(zip(list(sub_df), list(num1)))
            #number of sig hits without overlapping
            num2 = [h[1] for h in hits_num]
            hits2[pro_id] = dict(zip(list(sub_df), list(num2)))
            #if (np.sum(num1))>10:
            #pd.set_option('display.max_columns', None)
            #pd.set_option('display.max_rows', None)
            #print np.matrix(np.round(sub_df))
            #print num1
            #print num2
            #n+ = 1
            #if n == 10: break

        #export
        file_head = myIO.file_os(count_file).file_prefix() + '_polyclonal'
        myDict.basic(hits1, self.par['pro_ids']).dict2_to_file(
            file_head + '.txt', "\t")
        myDict.basic(hits2, self.par['pro_ids']).dict2_to_file(
            file_head + '_nonoverlapped.txt', "\t")
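
The per-column counting in sig_polyclonal reduces to DataFrame.apply with a threshold filter. A minimal sketch with made-up z-scores (the myCommon/myList helpers are left out):

import pandas as pd

#rows are peptides, columns are samples, values are z-scores
df = pd.DataFrame({'s1': [12.0, 3.0, 15.0], 's2': [1.0, 2.0, 30.0]},
                  index=['pep_a', 'pep_b', 'pep_c'])

def count_hits(col, threshold=10):
    hits = col[col >= threshold]
    #return the hit count plus the hit peptide ids, as hits_func does
    return len(hits), ','.join(hits.index)

print(df.apply(count_hits, axis=0))  #one (count, ids) tuple per sample column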
Example #4
 def line_replace(self, new_dict=None):
     #get old variables
     var_dict = self.to_dict()
     #refresh with the new values (None default avoids a shared mutable argument)
     if new_dict:
         var_dict.update(new_dict)
     #export to file
     myDict.basic(var_dict).dict_to_file(self.file, self.sep)
 def sample_info(self):
     sample_pairs = {}
     for raw_file, sample_name in self.raw_sample.items():
         raw_file_name = myIO.file_os(raw_file).file_name()
         group = 'NC' if 'BEADS' in raw_file_name.upper() else 'PhIP'
         if 'unassigned' not in raw_file_name:
             sample_name = re.sub('_R1', "", sample_name)
             pair = '{},{}'.format(raw_file_name, sample_name)
             sample_pairs[pair] = group
     #export dict to file
     print('Generate sample file: ', self.par['file_sample_info'])
     #order per record: fastq file name, sample_name, phip_group
     myDict.basic(sample_pairs).dict_to_file(self.par['file_sample_info'], ',')
Example #6
    def line_add(self, new_dict=None):
        #get old variables
        var_dict = self.to_dict()
        #add the new values (None default avoids a shared mutable argument)
        for name in (new_dict or {}):
            if name in var_dict:
                var_dict[name] = int(var_dict[name]) + int(new_dict[name])
            else:
                var_dict[name] = new_dict[name]
        #export to file
        myDict.basic(var_dict).dict_to_file(self.file, self.sep)
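
The merge-and-add logic of line_add is what collections.Counter provides directly; a minimal sketch, assuming all values are integer counts:

from collections import Counter

old = Counter({'raw_reads_num': 100, 'unaligned_reads_num': 7})
new = {'raw_reads_num': 50, 'multialigned_reads_num': 3}
old.update(new)  #sums overlapping keys, inserts new ones
print(dict(old))
#{'raw_reads_num': 150, 'unaligned_reads_num': 7, 'multialigned_reads_num': 3}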


Example #7
 def permute_col(self, times=2, slice_dict=None):
     #add shuffled dict into embed_dict
     embed_dict = {}
     for i in range(times):
         #1: shuffle data frame
         shuffled_df = self.df.iloc[np.random.permutation(len(self.df))].copy()
         shuffled_df.index = self.df.index
         #print shuffled_df
         #2: merge this permutation into the embedded dict, keyed by round i
         shuffled_dict = shuffled_df.to_dict()
         embed_dict = myDict.basic(embed_dict).combine_dupdict2(shuffled_dict, i)
         
     #convert to dataframe
     if slice_dict is None:
         permute = embed_dict # col-name is key1, row-name is key2
     else:
         permute = {}
         embed_df = pd.DataFrame(embed_dict)
         for slice_name, row_indexs in slice_dict.items():  #dict.iteritems() is Python 2 only
             permute[slice_name] = {}
             #print slice_name
             #
             sub_df = embed_df.loc[row_indexs]  #.ix was removed from pandas
             for col_name, col in sub_df.items():  #iteritems() is deprecated in pandas
                 permute[slice_name][col_name] = col #col is pd.Series
                 #print col
                 #print type(col)
                 #break
             
     #print permute
     return permute
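
The shuffle step above permutes the rows of a frame and then restores the original index, so values move while labels stay. A minimal sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({'s1': [1, 2, 3], 's2': [4, 5, 6]}, index=['a', 'b', 'c'])
shuffled = df.iloc[np.random.permutation(len(df))].copy()
shuffled.index = df.index  #relabel: every column is now shuffled against the index
print(shuffled)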
Example #8
 def combine_countfiles(self, args_tuple):
     #row_names should be None or list type
     infile_tail, RC_level, out_file, row_names = args_tuple
     #
     counting_dict2 = {}
     for sample_name in self.par['sample_names']:
         #get read counts of a given sample
         counting_file = '{}{}/{}{}'.format(self.par['dir_result'], sample_name, sample_name, infile_tail)
         sample_dict2 = myIO.file_os(counting_file, '\t').to_dict2()
         for ref in sample_dict2.keys():
             #print ref
             counts = sample_dict2[ref][RC_level]
             if ref in counting_dict2:
                 counting_dict2[ref].update({sample_name:counts})
                 #print '=='+ref+'=='
             else:
                 counting_dict2[ref] = {sample_name:counts}
             #print sample_name, ref,counting_dict2[ref]
     #export counting_dict
     myDict.basic(counting_dict2).dict2_to_file(out_file=out_file, row_names=row_names)
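
The accumulation into counting_dict2 is a standard nested-dict build; a minimal sketch of the same pattern with made-up per-sample tables and dict.setdefault:

tables = {'s1': {'ref_a': 10, 'ref_b': 0},
          's2': {'ref_a': 3}}
combined = {}
for sample, counts in tables.items():
    for ref, rc in counts.items():
        combined.setdefault(ref, {})[sample] = rc  #ref -> {sample_name: count}
print(combined)  #{'ref_a': {'s1': 10, 's2': 3}, 'ref_b': {'s1': 0}}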
Example #9
 def hits_permutation1(self, in_dict, sample_size=10):
     #get the pool for sampling
     pool = list(in_dict.keys())  #random.sample() needs a sequence, not a dict view
     #
     permute_dict = {}
     for i in range(self.par['permutation_times']):
         #random select some keys from the pool
         random_keys = random.sample(pool, sample_size)
         random_values = {}
         for k in random_keys:
             values_list = in_dict[k].split(',')
             for v in values_list:
                 if v in random_values:
                     random_values[v] += 1
                 else:
                     random_values[v] = 1
         #
         permute_dict[i] = random_values
     #transform dict: times in columns, value of in_dict is in rows
     permute_dict = myDict.basic(permute_dict).transform_dict2()
     return permute_dict
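
The permutation core is random.sample over the keys plus a frequency tally; a minimal standalone sketch with a made-up pool:

import random

in_dict = {'p1': 'a,b', 'p2': 'b,c', 'p3': 'c'}  #key -> comma-joined values
random_keys = random.sample(list(in_dict), 2)  #list() because random.sample needs a sequence
freq = {}
for k in random_keys:
    for v in in_dict[k].split(','):
        freq[v] = freq.get(v, 0) + 1
print(freq)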
Example #10
    def enrich_pro(self, infile, annot_A, annot_B, sep1, sep2):
        if annot_A is None: annot_A = 'transcript_id'
        if annot_B is None: annot_B = 'pro_motifs'
        print("Enrichment analysis of {} => {} : {}".format(
            annot_A, annot_B, infile))
        #read data frame
        file_sep = ',' if infile.endswith('.csv') else '\t'
        counts_df = pd.read_csv(infile,
                                index_col=0,
                                sep=file_sep,
                                low_memory=False)
        #get all ids connecting counts_df with annot_df
        A_ids = list(self.par['annot_df'][annot_A])
        #get all ids based on annot_type, in list format
        B_ids = myDataframe.basic(self.par['annot_df']).df_list(
            annot_B, sep1, sep2)
        #get A_ids vs list of B_ids in dict format
        AB_dict = myDataframe.basic(self.par['annot_df']).list_dict(
            annot_A, annot_B, sep1, sep2)

        #initiate: frequency of observed enriched motifs
        hits_observed = myDict.basic().init_dict2(B_ids, list(counts_df), 0)
        #initiate: zscores of obs based on permutation models
        hits_zscores = myDict.basic().init_dict2(B_ids, list(counts_df), 0)
        #initiate: detect bugs
        debugging = myDict.basic().init_dict2(
            B_ids + ['hits_counts', 'interact_counts'], {}, 'NA')
        #loop of data frame by columns
        for sample_name, zscores in counts_df.items():
            #print sample_name
            zscores = pd.Series(zscores)
            zscores.index = list(counts_df.index)
            #1: get ids of significant hits
            sig_zscores = zscores[zscores >= self.par['zscore_threshold']]
            obs_ids = list(sig_zscores.index)
            sig_num = len(obs_ids)
            #print annot_B, sample_name,sig_num
            #2: count frequency of enriched annotations, namely motifs
            obs_freq, obs_details = myDict.basic(AB_dict).elements_frequency(
                obs_ids)
            #print obs_freq.values()
            #debugging
            debugging['hits_counts'][sample_name] = sig_num
            debugging['interact_counts'][sample_name] = sum(obs_freq.values())

            #3: permute samples
            #print "\tenrichment: %s\t%s\t%s" % (sample_name, sig_num, len(obs_freq.keys()))
            perm_dict = {}
            for i in range(self.par['permutation_times']):
                perm_peps = random.sample(A_ids, sig_num)
                tmp_perm, tmp_details = myDict.basic(
                    AB_dict).elements_frequency(perm_peps)  # frequency dict
                for key, value in tmp_perm.items():
                    if key in perm_dict:
                        perm_dict[key].append(value)
                    else:
                        perm_dict[key] = [value]
            #print perm_dict

            #4: calculate z-scores of observed counts
            for enriched_id, obs_num in obs_freq.items():
                #update hit_observed
                hits_observed[enriched_id][
                    sample_name] = obs_num  #frequency of observed enriched annot
                #update debugging
                debugging[enriched_id][sample_name] = '{}:{}'.format(
                    obs_num, obs_details[enriched_id])
                #update zscores_dict
                if enriched_id in perm_dict:
                    perm_pools = perm_dict[enriched_id]
                    #pad with zeros so every pool covers all permutation rounds
                    perm_pools = perm_pools + [0] * (
                        self.par['permutation_times'] - len(perm_pools))
                    perm_mean = np.mean(perm_pools)
                    perm_sd = np.std(perm_pools)
                    #zscores of observed hits against the null model
                    zscore = (obs_num -
                              perm_mean) / perm_sd if perm_sd > 0 else (
                                  obs_num - perm_mean)
                    hits_zscores[enriched_id][sample_name] = round(zscore, 2)
                else:
                    hits_zscores[enriched_id][sample_name] = obs_num
            #print hits_zscores

        #export
        file_head = '{}{}_{}_'.format(self.par['dir_enrichment'],
                                      myIO.file_os(infile).name_prefix(),
                                      annot_B)
        myDict.basic(hits_observed).dict2_to_file(out_file=file_head +
                                                  'counting.txt',
                                                  index_label=annot_B)
        myDict.basic(hits_zscores).dict2_to_file(out_file=file_head +
                                                 'zscores.txt',
                                                 index_label=annot_B)
        myDict.basic(debugging).dict2_to_file(out_file=file_head +
                                              'debugging.txt',
                                              index_label=annot_B,
                                              NA='NA')
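
Step 4 scores each observed count against its permutation null. A minimal sketch of that z-score, with made-up numbers:

import numpy as np

obs_num = 8  #observed frequency of one annotation
perm_pools = [2, 3, 1, 0, 0]  #its frequency in each permutation round, zero-padded
perm_mean, perm_sd = np.mean(perm_pools), np.std(perm_pools)
zscore = (obs_num - perm_mean) / perm_sd if perm_sd > 0 else obs_num - perm_mean
print(round(zscore, 2))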
    def init_dir_file(self):
        self.par['dir_home'] = myIO.dir_os(self.par['dir_home']).create_dir()
        print('home directory of phip tool:', self.par['dir_home'])
        #dir_home = /home/yuan/phip/

        #alignment related
        self.par['dir_aligner'] = self.par['dir_home'] + 'bowtie1/'
        self.par['aligner_options'] = '{}bowtie {}'.format(
            self.par['dir_aligner'], self.par['aligner_options'])
        self.par['genome_index'] = self.par['dir_aligner'] + self.par[
            'genome_index_name']
        self.par['dir_ref_seq'] = self.par['dir_home'] + 'ref_seq/'
        self.par['file_ref_fa'] = '{}{}.fa'.format(
            self.par['dir_ref_seq'], self.par['genome_index_name'])
        if 'file_annotation' in self.par.keys():
            self.par['file_annotation'] = self.par['dir_ref_seq'] + self.par[
                'file_annotation']
        #
        #judge ref library human or virus
        if 'VirScan' in self.par['genome_index_name']:
            self.par['lib'] = 'virus'
            self.par[
                'file_NC'] = self.par['dir_ref_seq'] + 'virus_BeadsOnly.txt'
        elif 'human' in self.par['genome_index_name']:
            self.par['lib'] = 'human'
            self.par[
                'file_NC'] = self.par['dir_ref_seq'] + 'human_BeadsOnly.txt'
        elif 'PublicEpitope' in self.par['genome_index_name']:
            self.par['lib'] = 'PE'
        elif 'LISH' in self.par['genome_index_name']:
            self.par['lib'] = 'LISH'

        #dir of raw data
        if 'dir_raw_data' not in self.par.keys():
            self.par['dir_raw_data'] = myIO.dir_os(self.par['dir_home'] +
                                                   'raw_data').create_dir()
        #results related
        if 'dir_result' not in self.par.keys():
            self.par['dir_result'] = myIO.dir_os(self.par['dir_home'] +
                                                 'result').create_dir()
        #print('Result directory', self.par['dir_result'])
        if 'dir_result_array' not in self.par.keys():
            self.par['dir_result_array'] = self.par['dir_result']

        #dir of statistics
        self.par['dir_stat'] = myIO.dir_os(self.par['dir_result'] +
                                           'statistics').create_dir()
        self.par['dir_QC'] = myIO.dir_os(self.par['dir_stat'] +
                                         'QC').create_dir()
        self.par['dir_enrichment'] = myIO.dir_os(self.par['dir_stat'] +
                                                 'enrichment').create_dir()

        #sample info
        self.par[
            'file_sample_info'] = self.par['dir_result'] + 'sample_info.csv'
        self.par['dir_log'] = self.par['dir_result'] + 'sample_log/'
        self.par['file_log'] = self.par['dir_result'] + 'output.log'
        self.par['file_total_log'] = self.par['dir_result'] + 'Total.log'
        self.par['file_stat'] = self.par['dir_QC'] + 'statistics.csv'
        self.par['file_ref_txt'] = self.par['dir_result'] + 'references.txt'
        self.par[
            'file_pro_pep'] = self.par['dir_result'] + 'protein_peptides.txt'
        #raw data related
        #print(self.par['dir_raw_data'])
        #
        self.par['RC_levels'] = ['lowRC']  #lowRC, midRC, highRC
        self.par['phip_levels'] = ['pep', 'promax', 'prosum']
        files_dict = {}
        for pl in self.par['phip_levels']:
            file_head = '{}{}_'.format(self.par['dir_stat'], pl)
            #raw reads
            files_dict[pl + '_RC'] = file_head + 'RC.txt'
            #normalized by total raw counts
            files_dict[pl + '_scalingRC'] = file_head + 'scalingRC.txt'
            files_dict[
                pl + '_scalingRC_prosum'] = file_head + 'scalingRC_prosum.txt'
            files_dict[
                pl + '_scalingRC_promax'] = file_head + 'scalingRC_promax.txt'
            #scalingRC against regressed median of phip sample and regressed sd of negative controls
            files_dict[pl + '_NCPHIPzscores'] = file_head + 'NCPHIPzscores.txt'
            files_dict[
                pl +
                '_NCPHIPzscores_prosum'] = file_head + 'NCPHIPzscores_prosum.txt'
            files_dict[
                pl +
                '_NCPHIPzscores_promax'] = file_head + 'NCPHIPzscores_promax.txt'
        self.par['files_dict'] = files_dict

        #default parameters
        self.par['specieZ_threshold'] = int(
            self.par['specieZ_threshold']
        ) if 'specieZ_threshold' in self.par.keys() else 10
        self.par['align_score'] = float(
            self.par['align_score']) if 'align_score' in self.par.keys(
            ) else 80
        #p value cutoff for binomial testing
        self.par['p_threshold'] = float(
            self.par['p_threshold']) if 'p_threshold' in self.par.keys(
            ) else .001
        #x value is observed successes cutoff for binomial test
        self.par['x_threshold'] = float(
            self.par['x_threshold']) if 'x_threshold' in self.par.keys() else 1
        self.par['sim_threshold'] = float(
            self.par['sim_threshold']) if 'sim_threshold' in self.par.keys(
            ) else 0.8
        self.par['zscore_threshold'] = int(
            self.par['zscore_threshold']
        ) if 'zscore_threshold' in self.par.keys() else 10
        self.par['permutation_times'] = int(
            self.par['permutation_times']
        ) if 'permutation_times' in self.par.keys() else 100
        self.par['threads_num'] = int(self.par['threads_num'])
        self.par['scaling_factor'] = int(
            self.par['scaling_factor']) if 'scaling_factor' in self.par.keys(
            ) else 1e6

        #print self.par
        myDict.basic(self.par).print_dict()
        #
        return self.par
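
The repeated cast-if-present-else-default blocks above can be table-driven; a minimal sketch mirroring a few of the defaults (keys and casts as in the code, the table itself is hypothetical):

par = {'zscore_threshold': '15', 'threads_num': '4'}
defaults = {'zscore_threshold': (int, 10),
            'permutation_times': (int, 100),
            'align_score': (float, 80)}
for key, (cast, default) in defaults.items():
    par[key] = cast(par[key]) if key in par else default
print(par)
#{'zscore_threshold': 15, 'threads_num': '4', 'permutation_times': 100, 'align_score': 80.0}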
Example #12
def par_command(argv):
    phip_libs = ['human', 'virus', 'PE', 'allergome', 'LISH']
    #initiate parameters
    par = {'fq_file':'NA','barcode_file':'NA','index_file':'NA','I1_file':'NA','I2_file':'NA', \
        'dir_raw_data':'NA', 'dir_raw':'NA','dir_in':'NA', 'out':'NA', \
        'dir_result':'NA', 'multiplexing_mode':0, 'ref_libs':phip_libs[:2], \
        'seq_start':0, 'seq_end':0, 'seq_min':10, 'seq_max':0 }
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
                '-f <fastq file> -i <index file> -b <barcode file>\n'
    try:
        opts, args = getopt.getopt(argv[1:],"hf:i:b:o:t:l:x:y:m:n:z:c:",["help",\
                "fastq_file", "index_file", "barcode_file", "dir_raw_data", "trim_len",\
                'fixed_end5', 'dir_in', 'out', 'I1_file','I2_file','dir_raw','ref_library'])
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)

    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage
            # python Process_FASTQ.py -f * -i * -b * -o * -y *
            print("-h --help\tUsage information of this script.")
            print(
                "-t --trim_len\tTrim sequences from the 5'-end or 3'-end of reads (Optional)"
            )
            print(
                "-f --fastq_file\tFastq file determined by a sequencing analyzer."
            )
            print("-i --index_file\tIndex file matched with the fastq file.")
            print(
                "-b --barcode_file\tBarcode file matched with the index file.")
            print(
                "-o --raw_data\tDirectory storing demulitplexed *fastq files.")
            print(
                "-y --out\tDirectory storing sample_info.csv and variables.txt."
            )
            print(
                "-c --ref_library\tReference libraries can be one of {}, default is {}."
                .format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-o", "--raw_data"):
            par['dir_raw_data'] = myIO.dir_os(
                os.path.abspath(arg)).create_dir()
        elif opt in ("-z", "--all_raw_data"):  #only for additional sets of fastq splits
            par['dir_raw'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--fixed_len"):
            len_min, len_max = arg.split(':')
            par['seq_min'] = abs(int(len_min))
            par['seq_max'] = abs(int(len_max))
        elif opt in ("-t", "--trim_len"):
            trim_end5, trim_end3 = arg.split(':')
            par['seq_start'] = abs(int(trim_end5))
            par['seq_end'] = -abs(int(trim_end3))
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            libs = arg.split(',')
            par['ref_libs'] = [x for x in libs if x in phip_libs]
    #
    if par['seq_max'] > 0:
        par['seq_end'] = par['seq_max']
    #
    myDict.basic(par).print_dict()
    return par
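
For getopt, short options taking a value end with ':' and long options with '=' (the reason for the trailing '=' signs above). A minimal standalone sketch:

import getopt

argv = ['prog', '-f', 'reads.fq', '--out', 'results/']
opts, args = getopt.getopt(argv[1:], 'hf:y:', ['help', 'fastq_file=', 'out='])
print(opts)  #[('-f', 'reads.fq'), ('--out', 'results/')]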
Example #13
    def taxon_spec(self, count_file, taxon_rank, annot_index):
        #combine two data frame
        combined_df, phip_df = myCommon.basic(self.par).combine_df(
            count_file, annot_index)
        #print(combined_df)
        #print(list(combined_df.index))

        #taxonomy names:
        taxon_group = combined_df.groupby(taxon_rank).groups
        taxon_names = taxon_group.keys()
        taxon_names = [t for t in taxon_names if str(t) != 'nan']  #remove nan
        #print(taxon_names)
        taxon_pairs = {'taxon_specie':'InterSpecie', 'taxon_genus':'InterGenus', \
                     'taxon_family':'InterFamily', 'taxon_phip':'InterTaxon'}
        taxon_inter = taxon_pairs[taxon_rank]

        #inter-score dict
        #taxon_inter should be pep_ids separated by comma
        pepid_taxoninter = pd.Series(combined_df[taxon_inter],
                                     index=list(phip_df.index))
        inter_df = myDataframe.basic(phip_df).interact_df(
            pepid_taxoninter, max, count_file + taxon_inter)

        #make permutation of pep_ids
        #permute_dict = myList.basic(list(phip_df.index)).permute_Series(self.par['permutation_times'], slice_dict = taxon_group)

        #the hits of significant specie specific
        #rows are peptides, and columns are phip samples plus species names
        #z-scores matrix of specific peptides
        #initiate nested dict
        taxon_dict = dict([(s, {}) for s in list(phip_df)])  # number of hits
        taxon_dict['peptides'] = dict([(a, len(b))
                                       for a, b in taxon_group.items()])
        #taxon_pval_dict = dict([(s,{}) for s in list(phip_df)]) #pvalues of the hits by permutations
        taxon_pep_dict = dict([(s, {}) for s in list(phip_df)
                               ])  #pepid and zscores of hits
        debugging_dict = {}  #for identifying bugs
        for s in list(phip_df):
            debugging_dict[s + ':all_hits'] = {}
            debugging_dict[s + ':inter_hits'] = {}
            debugging_dict[s + ':intra_hits'] = {}
            debugging_dict[s + ':hits'] = {}
            debugging_dict[s + ':counts'] = {}
            #debugging_dict[s+':pvals'] = {}
        #loop by sample_names
        for sample_name, col in phip_df.items():
            #print(sample_name)
            for s, indexs in taxon_group.items():
                #1: inter-taxon searching
                inter_list = inter_df.loc[indexs][sample_name]  #.ix was removed from pandas
                inter_dict = self.taxon_inter_searching(
                    col[indexs], inter_list)
                #export
                debugging_dict[sample_name +
                               ':all_hits'][s] = inter_dict['all_hits']
                debugging_dict[sample_name +
                               ':inter_hits'][s] = inter_dict['inter_hits']
                #print(inter_dict)

                #2: intra-taxon searching
                intra_dict = self.taxon_intra_searching(
                    col[inter_dict['other_hits']])
                #export
                debugging_dict[sample_name +
                               ':intra_hits'][s] = intra_dict['intra_hits']
                debugging_dict[sample_name + ':hits'][s] = intra_dict['hits']
                all_hits = [
                    '{}:{}'.format('all', len(inter_dict['all_hits'])),
                    '{}:{}'.format('inter', len(inter_dict['inter_hits'])),
                    '{}:{}'.format('intra', len(intra_dict['intra_hits'])),
                    '{}:{}'.format('hits', len(intra_dict['hits']))
                ]
                debugging_dict[sample_name + ':counts'][s] = ','.join(all_hits)
                hit_list = [
                    '({},{})'.format(a, b)
                    for a, b in col[intra_dict['hits']].items()
                ]
                taxon_pep_dict[sample_name][s] = ','.join(hit_list)
                #counts matrix of taxonomy search
                taxon_dict[sample_name][s] = len(intra_dict['hits'])

                #3: permutation
                #hit_scores = col[intra_dict['hits']]
                #permuted_scores = permute_dict[s]#df, pepids in rows, permuted scores in columns
                #pval_dict = self.taxon_permutation(hit_scores, permuted_scores, col)
                #export
                #pval_list = [len(intra_dict['hits']), pval_dict['ttest_pval'], pval_dict['utest_pval']]
                #taxon_pval_dict[sample_name][s] = ','.join(map(str, pval_list))
                #pval_list = [ a+':'+str(b) for a,b in pval_dict.items()]
                #debugging_dict[sample_name+':pvals'][s] = ','.join(pval_list)
        #export to file
        file_head = '{}_{}_'.format(
            myIO.file_os(count_file).file_prefix(), taxon_rank)
        taxon_dict = myDict.basic(taxon_dict).transform_dict2()
        myDict.basic(taxon_dict).dict2_to_file(file_head + 'counting.txt',
                                               "\t")
        taxon_pep_dict = myDict.basic(taxon_pep_dict).transform_dict2()
        myDict.basic(taxon_pep_dict).dict2_to_file(file_head + 'peptides.txt',
                                                   "\t")
        debugging_dict = myDict.basic(debugging_dict).transform_dict2()
        myDict.basic(debugging_dict).dict2_to_file(file_head + 'debugging.txt',
                                                   "\t")
Example #14
 def count_reads(self):
     #key is ref name, value is reads string sep by comma, the first is ref seq
     unique_seq = dict((a, []) for a in self.par['ref_dict'].keys()) 
     #unique and multiple counts in dict
     unique = {} #key is ref name, value is counts
     multiple = {} # key is query name, value is the list of refs
     num = {}# counts statistics
     saturation = {0:{1:0, 5:0, 10:0, 'max':0 }} # count number for saturation analysis
     last_index = 0
     
     print('\tread sam file: {}.gz'.format(self.par['sample_sam_file']))
     IN = gzip.open(self.par['sample_sam_file']+'.gz', 'rt')
     UN = gzip.open(self.par['sample_dir']+self.par['sample_name']+'_unknown.fa.gz', 'wt')
     maxRC = 0
     for line in IN:
         #print(line)
         #counts
         num['raw_reads_num'] = num.setdefault('raw_reads_num',0)+1
         #analyze sam line
         info = self.analyze_SAM(line)
         qname, ref= info['qname'], info['ref']
         #unique alignment
         if info['aligned'] == '1':
             unique[ref] = unique.setdefault(ref,0) + 1
             if unique[ref] > maxRC: maxRC = unique[ref]
             #counting of saturation
             if unique[ref] in [1,5,10]:
                 last_counts = saturation[last_index].copy()  #copy() is essential: avoid aliasing the previous snapshot
                 last_counts[unique[ref]] += 1
                 last_counts['max'] = maxRC  #the maximum RC seen so far in the stream
                 saturation[num['raw_reads_num']] = last_counts
                 #print num['raw_reads_num'], last_index, saturation[num['raw_reads_num']]
                 last_index = num['raw_reads_num']
             #export aligned sequences of reads
             unique_seq[ref].append(info['seq'])
             num['unique_aligned_reads_num'] = num.setdefault('unique_aligned_reads_num',0)+1
         #multiple alignment
         elif info['aligned'] == '3':
             multiple[qname] = multiple[qname] + [ref] if qname in multiple else [ref]
             num['multialigned_reads_num'] = num.setdefault('multialigned_reads_num',0)+1
         #unalignment
         else:
             UN.write('>'+qname+'\n'+info['seq']+'\n')
             num['unaligned_reads_num'] = num.setdefault('unaligned_reads_num',0) + 1
     IN.close()
     UN.close()
     #counting of saturation
     if num['raw_reads_num'] > last_index:
         saturation[num['raw_reads_num']] = saturation[last_index].copy()
     #for key in sorted(saturation.keys()):
     #    print key, saturation[key]
     
     #update num statistics
     myIO.file_os(self.par['sample_log'], '=').line_add(num)
     
     print('\tcombine RCs from unique and multiple alignments of ', self.par['sample_name'])
     #reversed multiple
     #print multiple
     rev_multiple = myDict.basic(multiple).counting_reversed_dict()
     #print unique
     RC_dict = self.multiple_counts(unique, rev_multiple)
     #export
     print('\tSave read counts into ', self.par['sample_RC_file'])
     myDict.basic(RC_dict).dict2_to_file(self.par['sample_RC_file'], pattern='\t')
     myDict.basic(saturation).dict2_to_file(self.par['sample_saturation_file'], pattern='\t')
     #
     seq_counts = {}
     for ref, reads_list in unique_seq.items():
         key=ref+'\t'+self.par['ref_dict'][ref]+'\t'+str(len(reads_list))
         if len(reads_list)>0:
             freq_dict = myList.basic(reads_list).elements_frequency0()
             seq_counts[key] = ';'.join(str(a)+':'+str(b) for a,b in freq_dict.items())
         else:
             seq_counts[key] = 'NA'
     myDict.basic(seq_counts).dict_to_file(self.par['sample_dir']+'unique_aligned_reads.txt', pattern='\t')
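
elements_frequency0 is assumed to tally how often each aligned read sequence occurs; collections.Counter yields the same 'seq:count' summary. A minimal sketch:

from collections import Counter

reads_list = ['ACGT', 'ACGT', 'TTTT']
freq_dict = Counter(reads_list)
print(';'.join('{}:{}'.format(a, b) for a, b in freq_dict.items()))  #ACGT:2;TTTT:1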
    def decompose_fq2(self, par):
        print('The split FASTQ files are stored in {}'.format(
            par['dir_raw_data']))
        #output directory
        out_dir = myIO.dir_os(par['dir_raw_data']).create_dir()
        #sequencing direction: R1 or R2
        direction = self.R1R2()
        #read relationship between barcode vs sample from sample_file
        barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
        #barcode_sample={ mySequence.sequence(k).revcom_DNA():v for k,v in barcode_sample.items()}
        barcode_sample['unassigned'] = 'unassigned'
        #print barcode_sample
        #open file handles based on barcode_sample
        file_handle = {}
        barcode_file = {}
        known_dict = {}
        un_dict = {}
        for barcode, sample_name in barcode_sample.items():
            fq_file = '{}{}_{}.fq'.format(out_dir, sample_name, direction)
            file_handle[barcode] = open(fq_file, 'wt')
            barcode_file[barcode] = fq_file
            known_dict[barcode] = {
                'sample_name': sample_name,
                'read_counts': 0
            }
        ###

        stdout_format = '|{:^15}|{:^15}|{:^15}|{:^15}|'
        dash_line = stdout_format.format('-' * 15, '-' * 15, '-' * 15,
                                         '-' * 15)
        print(dash_line)
        print(
            stdout_format.format('Raw reads', 'Assigned reads', 'Percentage',
                                 'Trim reads'))
        print(stdout_format.format('millions', 'millions', '%', 'nt->nt'))
        print(dash_line)
        n = 0  #total number of reads
        m = 0  # total number assigned reads
        #file handle
        #with open(self.biofile, 'rt') as F1, open(index_file, 'rt') as F2:
        F1 = self.readonly_handle(self.biofile)  #fastq_file
        F2 = self.readonly_handle(par['I1_file'])  #I1_file
        F3 = self.readonly_handle(par['I2_file'])  #I2_file
        with F1, F2, F3:
            #read 4 lines at a time per file
            for L1, La, Le, L2, Lb, Lf, L3, Lc, Lg, L4, Ld, Lh in itertools.zip_longest(
                    *[F1, F2, F3] * 4):
                barcode = Lb.rstrip() + Lf.rstrip()
                rlen = len(L2) - 1
                tag = False
                #assign record based on barcode
                if barcode in file_handle and rlen >= par['seq_min']:
                    L_name = re.sub(r'\/', '#' + barcode + '/', L1)
                    #print L_name, La
                    #trim reads from 5 end or 3-end
                    L2 = L2.rstrip()
                    L4 = L4.rstrip()
                    L2 = L2[par['seq_start']:par['seq_end']] + "\n"
                    L4 = L4[par['seq_start']:par['seq_end']] + "\n"
                    #output file handle
                    file_handle[barcode].writelines([L_name, L2, L3, L4])
                    #counting
                    known_dict[barcode]['read_counts'] += 1
                    m += 1
                    tag = True
                else:
                    #output file handle
                    file_handle['unassigned'].writelines([L1, L2, L3, L4])
                    un_dict[barcode] = un_dict[
                        barcode] + 1 if barcode in un_dict else 1
                    known_dict['unassigned']['read_counts'] += 1
                n += 1
                #print progress every 5e5 reads
                if n >= 1e5 and n % 5e5 == 0:
                    perc = round(m * 100 / n, 2)
                    flen = len(L2) - 1
                    read_info = "{}-->{}".format(
                        rlen, flen) if tag is True else "{}-->X".format(rlen)
                    print(
                        stdout_format.format(n / 1e6, m / 1e6, perc,
                                             read_info))
                #if n==3e6: break
            else:
                print(dash_line)
                print(
                    stdout_format.format(n / 1e6, m / 1e6,
                                         round(m * 100 / n, 2), '---'))
                print(dash_line)
        #calculate percentage
        for bc in known_dict.keys():
            RC = float(known_dict[bc]['read_counts'])
            known_dict[bc]['percentage_%'] = round(RC * 100 / n, 2)
        #close file handle
        for b, F in file_handle.items():
            #close file handle
            F.close()
            #delete empty file
            if os.stat(barcode_file[b]).st_size == 0:
                os.remove(barcode_file[b])
        #export statistics
        myDict.basic(known_dict).dict2_to_file(out_dir + 'known.log', '\t')
        myDict.basic(un_dict).dict_to_file(out_dir + 'unknown.log', '\t')
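
The record-wise reading idiom zip_longest(*[F1, F2] * 4) pulls four lines per file per iteration, interleaved file by file, which is how the demultiplexers walk FASTQ and index files in lockstep. A minimal sketch with two in-memory records standing in for those files:

import io
import itertools

F1 = io.StringIO('@r1\nACGT\n+\nIIII\n')
F2 = io.StringIO('@r1\nGATT\n+\nIIII\n')
for L1, La, L2, Lb, L3, Lc, L4, Ld in itertools.zip_longest(*[F1, F2] * 4):
    print(L1.strip(), L2.strip(), Lb.strip())  #read name, read seq, index seq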
Example #16
    def demultiplex_fq(self, par):
        #output directory
        out_dir = myIO.dir_os(par['dir_raw_data']).create_dir()
        #sequencing direction: R1 or R2
        direction = self.R1R2()
        #read relationship between barcode vs sample from sample_file
        barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
        #barcode_sample={ mySequence.sequence(k).revcom_DNA():v for k,v in barcode_sample.items()}
        barcode_sample['unassigned'] = 'unassigned'
        #print barcode_sample
        #open file handles based on barcode_sample
        file_handle = {}
        barcode_file = {}
        known_dict = {}
        un_dict = {}
        for barcode, sample_name in barcode_sample.items():
            fq_file = '{}{}_{}.fq'.format(out_dir, sample_name, direction)
            file_handle[barcode] = open(fq_file, 'wt')
            barcode_file[barcode] = fq_file
            known_dict[barcode] = {
                'sample_name': sample_name,
                'read_counts': 0
            }
        ###

        #file handle
        #with open(self.biofile, 'rt') as F1, open(index_file, 'rt') as F2:
        F1 = self.readonly_handle(self.biofile)
        F2 = self.readonly_handle(par['index_file'])
        n = 0  #total number of reads
        m = 0  # total number assigned reads
        stdout_format = '|{:^15}|{:^15}|{:^15}|{:^15}|'
        dash_line = stdout_format.format('-' * 15, '-' * 15, '-' * 15,
                                         '-' * 15)
        print(dash_line)
        print(
            stdout_format.format('Raw reads', 'Assigned reads', 'Percentage',
                                 'Read Length'))
        print(stdout_format.format('millions', 'millions', '%', 'nt'))
        print(dash_line)
        with F1, F2:
            #read 4 lines at a time per file
            for L1, La, L2, Lb, L3, Lc, L4, Ld in itertools.zip_longest(
                    *[F1, F2] * 4):
                barcode = Lb.rstrip()
                #assign record based on barcode
                if barcode in file_handle and len(L2) >= par['seq_min']:
                    L_name = re.sub(r'\/', '#' + barcode + '/', L1)
                    #print L_name, La
                    #trim reads from 5 end
                    if par['seq_start'] > 0:
                        L2 = L2[par['seq_start']:]
                        L4 = L4[par['seq_start']:]
                    #trim the longer reads from 3-end
                    if par['seq_end'] != 0:
                        L2 = L2.rstrip()
                        L4 = L4.rstrip()
                        L2 = L2[:par['seq_end']] + "\n"
                        L4 = L4[:par['seq_end']] + "\n"
                    #output file handle
                    file_handle[barcode].writelines([L_name, L2, L3, L4])
                    #counting
                    known_dict[barcode]['read_counts'] += 1
                    m += 1
                else:
                    #output file handle
                    file_handle['unassigned'].writelines([L1, L2, L3, L4])
                    un_dict[barcode] = un_dict[
                        barcode] + 1 if barcode in un_dict else 1
                    known_dict['unassigned']['read_counts'] += 1
                n += 1
                #print progress every 1e6 assigned reads
                if m >= 1e6 and m % 1e6 == 0:
                    print(
                        stdout_format.format(n / 1e6, m / 1e6,
                                             round(m * 100 / n, 2),
                                             len(L2) - 1))
                #if n==3e6: break
            else:
                print(dash_line)
                print(
                    stdout_format.format(n / 1e6, m / 1e6, round(m * 100 / n, 2), '---'))
                print(dash_line)
        #calculate percentage
        for bc in known_dict.keys():
            RC = float(known_dict[bc]['read_counts'])
            known_dict[bc]['percentage_%'] = round(RC * 100 / n, 2)
        #close file handle
        for b, F in file_handle.items():
            #close file handle
            F.close()
            #delete empty file
            if os.stat(barcode_file[b]).st_size == 0:
                os.remove(barcode_file[b])
        #export statistics
        myDict.basic(known_dict).dict2_to_file(out_dir + 'known.log', '\t')
        myDict.basic(un_dict).dict_to_file(out_dir + 'unknown.log', '\t')
def par_command(argv):
    phip_libs = ['human', 'virus', 'allergome', 'provirome', 'toxome', 'mouse', 'PE', 'zika', 'arbo', 'LISH']
    #initiate parameter
    na_str='fq_file,barcode_file,index_file,I1_file,I2_file,dir_raw_data,dir_in,out,dir_result'
    par=dict([(key, 'NA') for key in na_str.split(',')])
    par.update({'ref_libs':phip_libs[:2], 'seq_start':0, 'seq_end':None, 'seq_min':10})
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
                '-f <fastq file> -i <index file> -b <barcode file>\n'
    try:
        opts, args = getopt.getopt(argv[1:],"hf:i:b:o:t:r:l:x:y:m:n:c:",["help",\
            "fastq_file", "index_file", "barcode_file", "dir_raw_data", "trim_5end", 'len_trim',\
            'fixed_end5', 'dir_in', 'out', 'I1_file','I2_file','ref_library'])
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)
      
    #get parameters 
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage
            # python Process_FASTQ.py -f * -i * -b * -o * -y *
            print("-h --help\tUsage information of this script.")
            print("-t --trim_len\tTrim sequences from the 5'-end or 3'-end of reads (Optional)")
            print("-f --fastq_file\tFastq file determined by a sequencing analyzer.")
            print("-i --index_file\tIndex file matched with the fastq file.")
            print("-b --barcode_file\tBarcode file matched with the index file.")
            print("-o --raw_data\tDirectory storing demulitplexed *fastq files.")
            print("-y --out\tDirectory storing sample_info.csv and variables.txt.")
            print("-c --ref_library\tReference libraries can be any of {}, default is {}.".format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
        elif opt in ("-o", "--raw_data"):
            par['dir_raw_data'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--min_len"):
            # discard shorter reads due to poor sequencing
            par['seq_min'] = abs(int(arg))
        elif opt in ("-t", "--trim_5end"):
            #trim_end5: length of nt from the 5-end
            par['seq_start'] = abs(int(arg))
        elif opt in ("-r", "--fixed_len"):
            #len_trim: length of nt after trimming 5-end and 3-end
            par['seq_len'] = abs(int(arg))
            par['seq_end'] = par['seq_start'] + par['seq_len']
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            libs = arg.split(',')
            par['ref_libs'] = [x for x in libs if x in phip_libs]
    #   
    myDict.basic(par).print_dict()
    return par