Exemple #1
0
 def permute_taxon_blast(self, hits_num):
     """Return the rounded per-row mean of permuted hit counts.

     The permutation table for ``hits_num`` is cached as a tab-separated
     file inside ``self.par['dir_out']``; when that file already exists it
     is read back instead of being recomputed.

     :param hits_num: value handed to ``permute_list`` together with
         ``self.par['permutation_times']``; also used as the cache file name.
     :return: for every row of the permutation table, the mean of the
         floored values, rounded.
     """
     print('permutation of viral blast:{}\t{}'.format(self.par['type'], hits_num))
     outfile = '{}{}.txt'.format(myIO.dir_os(self.par['dir_out']).create_dir(), hits_num)
     if os.path.isfile(outfile):
         print('Read file: ', outfile)
         counts_df = pd.read_csv(outfile, header=0, index_col=0, sep="\t", low_memory=False)
     else:
         counts_df = pd.DataFrame()
         binary_aln_df = self.par['binary_aln_df']
         #1: permutated peptides
         pep_names = list(binary_aln_df.index)
         pep_df = myList.basic(pep_names).permute_list(self.par['permutation_times'], hits_num)
         #2: permutation based on the non-overlapped hits num
         for col, perm_pep in pep_df.items():
             # DataFrame.ix was removed in pandas 1.0. The permuted values
             # come from the frame's own index, so label-based .loc is the
             # matching accessor.
             perm_zb = binary_aln_df.loc[list(perm_pep)]
             p_collapse_zb, p_sim_tag = myDataframe.basic(perm_zb).unispecie(self.par['sim_threshold'])
             counts_df[col] = p_collapse_zb.apply(sum,axis=0) + p_sim_tag
         #export so the next call with the same hits_num can reuse the table
         counts_df.to_csv(outfile, sep='\t', header=True, index_label=self.par['type'])
     #combine permuated counts
     perm_mean = counts_df.apply(lambda x: np.mean(np.floor(x)), axis=1).round()
     return perm_mean
 def seek_fq(self,dir_raw_data):
     """Return the files found for ``dir_raw_data`` whose names end in fastq/fq.

     The candidate list comes from ``myIO.dir_os(...).recrusive_files()``;
     only entries matching the suffix pattern are kept, in their original
     order.
     """
     print('Retrieve all *.fastq files under', dir_raw_data)
     #names must end with "fastq" or "fq"
     suffix_pattern = re.compile(r'fastq$|fq$')
     candidates = myIO.dir_os(dir_raw_data).recrusive_files()
     return [path for path in candidates if suffix_pattern.search(path)]
Exemple #3
0
    def phipseq_alignment(self, sample_name):
        """Run the optional alignment and read-counting steps for one sample.

        A shallow copy of ``self.par`` is extended with the sample's name,
        directory, raw files and output file names. Alignment runs when
        ``phip_alignment`` is ``'yes'`` (only the ``bowtie1`` aligner is
        dispatched here) and counting runs when ``phip_counting`` is
        ``'yes'``. Finally the timing record returned by
        ``mySystem.system().get_time`` is written to the sample log.

        :param sample_name: key into ``self.par['sample_dirs']`` and
            ``self.par['sample_to_raw']``.
        """
        print('\n######Anslysis of {} will be trigerred!#####'.format(
            sample_name))
        #per-sample settings live in a copy, self.par itself is not modified
        cfg = dict(self.par)
        cfg['start_time'] = time.time()
        cfg['sample_name'] = sample_name

        #sample directory
        cfg['sample_dir'] = myIO.dir_os(
            self.par['sample_dirs'][sample_name]).create_dir()
        print('\tSample directory: ', cfg['sample_dir'])

        #raw data, comma separated
        raw_files = cfg['sample_to_raw'][sample_name]
        cfg['sample_raw_files'] = ','.join(raw_files)
        print('\tRaw files: ', cfg['sample_raw_files'])

        #every output file shares the same prefix
        prefix = cfg['sample_dir'] + sample_name
        cfg['file_head'] = prefix
        output_suffixes = (
            ('sample_sam_file', '.sam'),  #alignment output
            ('sample_RC_file', '_RC.txt'),  #read counts
            ('sample_pro_sumRC_file', '_pro_sumRC.txt'),
            ('sample_pro_maxRC_file', '_pro_maxRC.txt'),
            ('sample_saturation_file', '_saturation.txt'),  #saturation analysis
            ('sample_log', '.log'),  #sample log
        )
        for key, suffix in output_suffixes:
            cfg[key] = prefix + suffix

        #sequence alignment, output is a sam file
        if cfg['phip_alignment'] == 'yes':
            print("\n###sequence alignment", cfg['tool_aligner'])
            if cfg['tool_aligner'] == 'bowtie1':
                myAlign.alignment(cfg).bowtie1_alignment()

        #count reads by peptides, then by proteins when an annotation is set
        if cfg['phip_counting'] == 'yes':
            myAlign.alignment(cfg).count_reads()
            if 'file_annotation' in self.par:
                self.combine_peptides(cfg)

        #update sample log
        elapsed = mySystem.system().get_time(cfg['start_time'])
        elapsed['sample_name'] = sample_name
        myIO.file_os(cfg['sample_log'], '=').line_replace(elapsed)
    def match_fasta(self):
        """Select a FASTA file and a GTF/GFF3 file from ``dir_out`` and match them.

        The selected paths are stored in ``self.par['match_fa']`` and
        ``self.par['match_gtf']``. Depending on ``self.par['web_site']`` the
        Ensembl or the NCBI matcher of ``myGenome.genome`` is invoked; any
        other value leaves the files unmatched.
        """
        # list() so the file collection can be scanned twice even if the
        # helper hands back a one-shot iterator
        files = list(myIO.dir_os(self.par['dir_out']).incrusive_files())
        #select a fasta file (real lists: filter() is lazy on Python 3)
        fa_files = [x for x in files if x.endswith(('.fa', '.fasta'))]
        self.par['match_fa'] = mySystem.system().select_key(fa_files)
        #select a gtf or gff file
        gtf_files = [x for x in files if x.endswith(('.gtf', '.gff3'))]
        self.par['match_gtf'] = mySystem.system().select_key(gtf_files)

        #match: read the instance parameters, not a bare global ``par``
        web_site = self.par['web_site']
        if web_site == 'ENSEML':
            myGenome.genome(self.par['match_fa']).match_ensembl_fa(
                self.par['match_gtf'])
        elif web_site == 'NCBI':
            myGenome.genome(self.par['match_fa']).match_ncbi_fa(
                self.par['match_gtf'])
    def init_dir_file(self):
        """Fill ``self.par`` with derived directories, file names and defaults.

        Keys that must already be present: ``dir_home``, ``aligner_options``,
        ``genome_index_name`` and ``threads_num``. Optional keys
        (``file_annotation``, ``dir_raw_data``, ``dir_result``,
        ``dir_result_array`` and the numeric thresholds) are kept or
        converted when supplied and defaulted otherwise.

        :return: the updated ``self.par`` dictionary (same object).
        """
        par = self.par
        home = myIO.dir_os(par['dir_home']).create_dir()
        par['dir_home'] = home
        print('home directory of phip tool:', home)

        #alignment related
        aligner_dir = home + 'bowtie1/'
        par['dir_aligner'] = aligner_dir
        par['aligner_options'] = '{}bowtie {}'.format(aligner_dir,
                                                      par['aligner_options'])
        index_name = par['genome_index_name']
        par['genome_index'] = aligner_dir + index_name
        ref_dir = home + 'ref_seq/'
        par['dir_ref_seq'] = ref_dir
        par['file_ref_fa'] = '{}{}.fa'.format(ref_dir, index_name)
        if 'file_annotation' in par:
            par['file_annotation'] = ref_dir + par['file_annotation']

        #judge the reference library from the index name; first match wins,
        #only the virus and human libraries get a negative-control file
        known_libraries = (
            ('VirScan', 'virus', 'virus_BeadsOnly.txt'),
            ('human', 'human', 'human_BeadsOnly.txt'),
            ('PublicEpitope', 'PE', None),
            ('LISH', 'LISH', None),
        )
        for marker, lib_name, nc_name in known_libraries:
            if marker in index_name:
                par['lib'] = lib_name
                if nc_name is not None:
                    par['file_NC'] = ref_dir + nc_name
                break

        #dir of raw data
        if 'dir_raw_data' not in par:
            par['dir_raw_data'] = myIO.dir_os(home + 'raw_data').create_dir()
        #results related
        if 'dir_result' not in par:
            par['dir_result'] = myIO.dir_os(home + 'result').create_dir()
        if 'dir_result_array' not in par:
            par['dir_result_array'] = par['dir_result']
        result_dir = par['dir_result']

        #dir of statistics
        stat_dir = myIO.dir_os(result_dir + 'statistics').create_dir()
        par['dir_stat'] = stat_dir
        qc_dir = myIO.dir_os(stat_dir + 'QC').create_dir()
        par['dir_QC'] = qc_dir
        par['dir_enrichment'] = myIO.dir_os(stat_dir +
                                            'enrichment').create_dir()

        #sample info and log files
        par['file_sample_info'] = result_dir + 'sample_info.csv'
        par['dir_log'] = result_dir + 'sample_log/'
        par['file_log'] = result_dir + 'output.log'
        par['file_total_log'] = result_dir + 'Total.log'
        par['file_stat'] = qc_dir + 'statistics.csv'
        par['file_ref_txt'] = result_dir + 'references.txt'
        par['file_pro_pep'] = result_dir + 'protein_peptides.txt'

        #one set of matrix files per phip level
        par['RC_levels'] = ['lowRC']  #lowRC, midRC, highRC
        par['phip_levels'] = ['pep', 'promax', 'prosum']
        matrix_kinds = (
            'RC',  #raw reads
            'scalingRC',  #normalized by total raw counts
            'scalingRC_prosum',
            'scalingRC_promax',
            'NCPHIPzscores',  #z-scores against negative controls
            'NCPHIPzscores_prosum',
            'NCPHIPzscores_promax',
        )
        files_dict = {}
        for level in par['phip_levels']:
            head = '{}{}_'.format(par['dir_stat'], level)
            for kind in matrix_kinds:
                files_dict['{}_{}'.format(level, kind)] = head + kind + '.txt'
        par['files_dict'] = files_dict

        #default parameters: convert a supplied value, else use the fallback
        #as written (fallbacks are not passed through the converter)
        optional_numbers = (
            ('specieZ_threshold', int, 10),
            ('align_score', float, 80),
            ('p_threshold', float, .001),  #p value cutoff, binomial testing
            ('x_threshold', float, 1),  #observed successes cutoff
            ('sim_threshold', float, 0.8),
            ('zscore_threshold', int, 10),
            ('permutation_times', int, 100),
        )
        for key, convert, fallback in optional_numbers:
            par[key] = convert(par[key]) if key in par else fallback
        #threads_num has no fallback and must be supplied
        par['threads_num'] = int(par['threads_num'])
        if 'scaling_factor' in par:
            par['scaling_factor'] = int(par['scaling_factor'])
        else:
            par['scaling_factor'] = 1e6

        myDict.basic(par).print_dict()
        return par
Exemple #6
0
 def __init__(self,out_dir):
     """Store the prepared output directory and the UniProt FTP base URL."""
     #create_dir() supplies the value kept as the output directory
     prepared_dir = myIO.dir_os(out_dir).create_dir()
     self.out_dir = prepared_dir
     self.url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/'
Exemple #7
0
 def __init__(self, specie, out_dir):
     """Remember the species, prepare its output directory and build URLs.

     The directory handed to ``myIO.dir_os`` is ``out_dir`` with ``specie``
     appended directly (no separator is added here).
     """
     self.specie = specie
     species_dir = out_dir + specie
     self.out_dir = myIO.dir_os(species_dir).create_dir()
     #initiate url list
     self.url_list()
    def decompose_fq2(self, par):
        """Split a FASTQ file into per-sample files using two index reads.

        The barcode of a record is the sequence line of the matching record
        in ``par['I1_file']`` followed by the one in ``par['I2_file']``.
        A record whose barcode is listed in ``par['barcode_file']`` and whose
        read has at least ``par['seq_min']`` characters (newline excluded) is
        trimmed to ``[par['seq_start']:par['seq_end']]`` and written to that
        sample's file, with the barcode inserted into the record name.
        Every other record is written unchanged to the ``unassigned`` file.
        Per-barcode counts go to ``known.log`` and the counts of unassigned
        barcodes to ``unknown.log`` in the output directory. Output files
        that stay empty are deleted.

        :param par: parameter dictionary; keys read here are
            ``dir_raw_data``, ``barcode_file``, ``I1_file``, ``I2_file``,
            ``seq_min``, ``seq_start`` and ``seq_end``.
        """
        print('The splited FASTQ files are stored into {}'.format(
            par['dir_raw_data']))
        #our directory
        out_dir = myIO.dir_os(par['dir_raw_data']).create_dir()
        #sequencing direction: R1 or R2
        direction = self.R1R2()
        #read relationship between barcode vs sample from sample_file
        barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
        barcode_sample['unassigned'] = 'unassigned'

        file_handle = {}
        barcode_file = {}
        known_dict = {}
        un_dict = {}
        n = 0  #total number of reads
        m = 0  #total number of assigned reads
        try:
            #open one output handle per barcode
            for barcode, sample_name in barcode_sample.items():
                fq_file = '{}{}_{}.fq'.format(out_dir, sample_name, direction)
                file_handle[barcode] = open(fq_file, 'wt')
                barcode_file[barcode] = fq_file
                known_dict[barcode] = {
                    'sample_name': sample_name,
                    'read_counts': 0
                }

            stdout_format = '|{:^15}|{:^15}|{:^15}|{:^15}|'
            dash_line = stdout_format.format('-' * 15, '-' * 15, '-' * 15,
                                             '-' * 15)
            print(dash_line)
            print(
                stdout_format.format('Raw reads', 'Assigned reads',
                                     'Percentage', 'Trim reads'))
            print(stdout_format.format('millions', 'millions', '%', 'nt->nt'))
            print(dash_line)

            F1 = self.readonly_handle(self.biofile)  #fastq_file
            F2 = self.readonly_handle(par['I1_file'])  #I1_file
            F3 = self.readonly_handle(par['I2_file'])  #I2_file
            with F1, F2, F3:
                #read 4 lines at a time per file
                # NOTE(review): zip_longest pads with None when the three
                # files differ in length, which then fails on .rstrip();
                # inputs are expected to hold the same number of records.
                for L1, La, Le, L2, Lb, Lf, L3, Lc, Lg, L4, Ld, Lh in itertools.zip_longest(
                        *[F1, F2, F3] * 4):
                    barcode = Lb.rstrip() + Lf.rstrip()
                    rlen = len(L2) - 1
                    tag = False
                    #assign record based on barcode
                    if barcode in file_handle and rlen >= par['seq_min']:
                        L_name = re.sub(r'\/', '#' + barcode + '/', L1)
                        #trim reads from 5 end or 3-end
                        L2 = L2.rstrip()
                        L4 = L4.rstrip()
                        L2 = L2[par['seq_start']:par['seq_end']] + "\n"
                        L4 = L4[par['seq_start']:par['seq_end']] + "\n"
                        file_handle[barcode].writelines([L_name, L2, L3, L4])
                        #counting
                        known_dict[barcode]['read_counts'] += 1
                        m += 1
                        tag = True
                    else:
                        file_handle['unassigned'].writelines(
                            [L1, L2, L3, L4])
                        un_dict[barcode] = un_dict.get(barcode, 0) + 1
                        known_dict['unassigned']['read_counts'] += 1
                    n += 1
                    #progress row every 500,000 reads
                    if n >= 1e5 and n % 5e5 == 0:
                        perc = round(m * 100 / n, 2)
                        flen = len(L2) - 1
                        read_info = "{}-->{}".format(
                            rlen,
                            flen) if tag is True else "{}-->X".format(rlen)
                        print(
                            stdout_format.format(n / 1e6, m / 1e6, perc,
                                                 read_info))
            #summary row; an empty input must not divide by zero
            total_perc = round(m * 100 / n, 2) if n else 0.0
            print(dash_line)
            print(stdout_format.format(n / 1e6, m / 1e6, total_perc, '---'))
            print(dash_line)
        finally:
            #always release the output handles, also when a read fails
            for F in file_handle.values():
                F.close()

        #calculate percentage
        for bc in known_dict:
            RC = float(known_dict[bc]['read_counts'])
            known_dict[bc]['percentage_%'] = round(RC * 100 / n,
                                                   2) if n else 0.0
        #delete empty files
        for b in file_handle:
            if os.stat(barcode_file[b]).st_size == 0:
                os.remove(barcode_file[b])
        #export statistics
        myDict.basic(known_dict).dict2_to_file(out_dir + 'known.log', '\t')
        myDict.basic(un_dict).dict_to_file(out_dir + 'unknown.log', '\t')
Exemple #9
0
    def demultiplex_fq(self, par):
        """Split a FASTQ file into per-sample files using one index file.

        The barcode of a record is the sequence line of the matching record
        in ``par['index_file']``. A record whose barcode is listed in
        ``par['barcode_file']`` and whose sequence line passes the
        ``par['seq_min']`` length check is optionally trimmed and written to
        that sample's file, with the barcode inserted into the record name.
        Every other record is written unchanged to the ``unassigned`` file.
        Per-barcode counts go to ``known.log`` and the counts of unassigned
        barcodes to ``unknown.log`` in the output directory. Output files
        that stay empty are deleted.

        :param par: parameter dictionary; keys read here are
            ``dir_raw_data``, ``barcode_file``, ``index_file``, ``seq_min``,
            ``seq_start`` and ``seq_end``.
        """
        #our directory
        out_dir = myIO.dir_os(par['dir_raw_data']).create_dir()
        #sequencing direction: R1 or R2
        direction = self.R1R2()
        #read relationship between barcode vs sample from sample_file
        barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
        barcode_sample['unassigned'] = 'unassigned'

        file_handle = {}
        barcode_file = {}
        known_dict = {}
        un_dict = {}
        n = 0  #total number of reads
        m = 0  #total number of assigned reads
        try:
            #open one output handle per barcode
            for barcode, sample_name in barcode_sample.items():
                fq_file = '{}{}_{}.fq'.format(out_dir, sample_name, direction)
                file_handle[barcode] = open(fq_file, 'wt')
                barcode_file[barcode] = fq_file
                known_dict[barcode] = {
                    'sample_name': sample_name,
                    'read_counts': 0
                }

            F1 = self.readonly_handle(self.biofile)
            F2 = self.readonly_handle(par['index_file'])
            stdout_format = '|{:^15}|{:^15}|{:^15}|{:^15}|'
            dash_line = stdout_format.format('-' * 15, '-' * 15, '-' * 15,
                                             '-' * 15)
            print(dash_line)
            print(
                stdout_format.format('Raw reads', 'Assigned reads',
                                     'Percentage', 'Read Length'))
            print(stdout_format.format('millions', 'millions', '%', 'nt'))
            print(dash_line)
            with F1, F2:
                #read 4 lines at a time per file
                # NOTE(review): zip_longest pads with None when the two
                # files differ in length, which then fails on .rstrip();
                # inputs are expected to hold the same number of records.
                for L1, La, L2, Lb, L3, Lc, L4, Ld in itertools.zip_longest(
                        *[F1, F2] * 4):
                    barcode = Lb.rstrip()
                    assigned = False
                    #assign record based on barcode
                    # NOTE(review): len(L2) here still counts the trailing
                    # newline while the progress row reports len(L2) - 1;
                    # confirm which length seq_min is meant to bound.
                    if barcode in file_handle and len(L2) >= par['seq_min']:
                        L_name = re.sub(r'\/', '#' + barcode + '/', L1)
                        #trim reads from 5 end
                        if par['seq_start'] > 0:
                            L2 = L2[par['seq_start']:]
                            L4 = L4[par['seq_start']:]
                        #trim the longer reads from 3-end
                        if par['seq_end'] != 0:
                            L2 = L2.rstrip()
                            L4 = L4.rstrip()
                            L2 = L2[:par['seq_end']] + "\n"
                            L4 = L4[:par['seq_end']] + "\n"
                        file_handle[barcode].writelines([L_name, L2, L3, L4])
                        #counting
                        known_dict[barcode]['read_counts'] += 1
                        m += 1
                        assigned = True
                    else:
                        file_handle['unassigned'].writelines(
                            [L1, L2, L3, L4])
                        un_dict[barcode] = un_dict.get(barcode, 0) + 1
                        known_dict['unassigned']['read_counts'] += 1
                    n += 1
                    #progress row once per million assigned reads; requiring
                    #that m just changed stops the row from repeating for
                    #every unassigned read that follows
                    if assigned and m % 1e6 == 0:
                        print(
                            stdout_format.format(n / 1e6, m / 1e6,
                                                 round(m * 100 / n, 2),
                                                 len(L2) - 1))
            #summary row; rounded like the progress rows, safe on empty input
            total_perc = round(m * 100 / n, 2) if n else 0.0
            print(dash_line)
            print(stdout_format.format(n / 1e6, m / 1e6, total_perc, '---'))
            print(dash_line)
        finally:
            #always release the output handles, also when a read fails
            for F in file_handle.values():
                F.close()

        #calculate percentage
        for bc in known_dict:
            RC = float(known_dict[bc]['read_counts'])
            known_dict[bc]['percentage_%'] = round(RC * 100 / n,
                                                   2) if n else 0.0
        #delete empty files
        for b in file_handle:
            if os.stat(barcode_file[b]).st_size == 0:
                os.remove(barcode_file[b])
        #export statistics
        myDict.basic(known_dict).dict2_to_file(out_dir + 'known.log', '\t')
        myDict.basic(un_dict).dict_to_file(out_dir + 'unknown.log', '\t')
Exemple #10
0
    #pass arguments
    start, end = sys.argv[1].split('-')
    par = {
        'specie_permutation': 'yes',
        'organism_permutation': 'yes',
        'threads_num': 24,
        'start': int(start),
        'end': int(end) + 1,
        'align_score': 80,
        'sim_threshold': 0.8,
        'dir_bin': dir_bin + '/',
        'dir_home': dir_home + '/',
        'permutation_times': 100
    }
    par['dir_permutation'] = myIO.dir_os(par['dir_home'] +
                                         'permutation/').create_dir()

    print('###permutation procedure\n\n')
    pool = mpd.Pool(processes=par['threads_num'])

    #permuation of organism alignment
    if par['organism_permutation'] == 'yes':
        #read aln file
        file_aln = par['dir_home'] + 'ref_seq/organism_blast.txt'
        par['binary_aln_df'] = myDataframe.basic().aln_df(
            file_aln, par['align_score'])
        par['type'] = myIO.file_os(file_aln).name_prefix()
        par['dir_out'] = myIO.dir_os(par['dir_home'] + 'permutation/' +
                                     par['type']).create_dir()
        #
        for hits_num in range(par['start'], par['end']):
Exemple #11
0
def par_command(argv):
    """Parse the command line of the FASTQ pre-processing script.

    :param argv: full argument vector; ``argv[0]`` is the program name used
        in the usage text and the remaining items are the options.
    :return: parameter dictionary. Unset paths stay ``'NA'``;
        ``multiplexing_mode`` counts how many of -f/-i/-b were given.

    Prints the usage text and exits with status 2 on an unknown option, on
    an option missing its value, and on -h/--help.
    """
    phip_libs = ['human', 'virus', 'PE', 'allergome', 'LISH']
    #initiate parameters
    par = {'fq_file': 'NA', 'barcode_file': 'NA', 'index_file': 'NA',
           'I1_file': 'NA', 'I2_file': 'NA',
           'dir_raw_data': 'NA', 'dir_raw': 'NA', 'dir_in': 'NA', 'out': 'NA',
           'dir_result': 'NA', 'multiplexing_mode': 0,
           'ref_libs': phip_libs[:2],
           'seq_start': 0, 'seq_end': 0, 'seq_min': 10, 'seq_max': 0}
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
                '-f <fastq file> -i <index file> -b <barcode file>\n'
    # getopt only hands a value to a long option whose name ends in '='.
    # The list holds the names tested by the handlers below plus the
    # spellings that were registered before (dir_raw_data, dir_raw), now
    # routed to the matching handler. 'fixed_end5' never had a handler and
    # stays a value-less flag so existing invocations keep parsing.
    long_opts = ["help", "fastq_file=", "index_file=", "barcode_file=",
                 "raw_data=", "dir_raw_data=", "all_raw_data=", "dir_raw=",
                 "trim_len=", "fixed_len=", "fixed_end5", "dir_in=", "out=",
                 "I1_file=", "I2_file=", "ref_library="]
    try:
        opts, args = getopt.getopt(argv[1:], "hf:i:b:o:t:l:x:y:m:n:z:c:",
                                   long_opts)
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)

    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage
            # python Process_FASTQ.py -f * -i * -b * -o * -y *"
            print("-h --help\tUsage information of this script.")
            print(
                "-t --trim_len\tTrim sequences from the 5'-end or 3'-end of reads (Optional)"
            )
            print(
                "-f --fastq_file\tFastq file determined by a sequencing analyzer."
            )
            print("-i --index_file\tIndex file matched with the fastq file.")
            print(
                "-b --barcode_file\tBarcode file matched with the index file.")
            print(
                "-o --raw_data\tDirectory storing demulitplexed *fastq files.")
            print(
                "-y --out\tDirectory storing sample_info.csv and variables.txt."
            )
            print(
                "-c --ref_library\tReference libraries can be one of {}, default is {}."
                .format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-o", "--raw_data", "--dir_raw_data"):
            par['dir_raw_data'] = myIO.dir_os(
                os.path.abspath(arg)).create_dir()
        elif opt in ("-z", "--all_raw_data", "--dir_raw"):
            # only for one more sets of fastq splits
            par['dir_raw'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--fixed_len"):
            #value has the form <min>:<max>
            len_min, len_max = arg.split(':')
            par['seq_min'] = abs(int(len_min))
            par['seq_max'] = abs(int(len_max))
        elif opt in ("-t", "--trim_len"):
            #value has the form <5'-trim>:<3'-trim>; the 3' trim is stored
            #as a negative slice end
            trim_end5, trim_end3 = arg.split(':')
            par['seq_start'] = abs(int(trim_end5))
            par['seq_end'] = -abs(int(trim_end3))
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            #unknown library names are dropped silently
            libs = arg.split(',')
            par['ref_libs'] = [x for x in libs if x in phip_libs]
    #a positive maximum length overrides the 3'-end trim
    if par['seq_max'] > 0:
        par['seq_end'] = par['seq_max']
    #
    myDict.basic(par).print_dict()
    return par
Exemple #12
0
    if 'fq_files' in par.keys():
        for fq in par['fq_files']:
            myGenome.genome(fq).trim_fq(par['dir_raw_data'], par['seq_start'],
                                        par['seq_end'])

    #generate sample_info file under result dir:
    if par['out'] != 'NA':
        #current dir
        par['dir_bin'] = os.path.dirname(os.path.realpath(__file__)) + '/'
        par['dir_home'] = os.path.abspath(
            os.path.join(par['dir_bin'], os.pardir)) + '/'
        print('Home directory of phip pipsline: ', par['dir_home'])

        #libraries. default is human and virus
        for lib in par['ref_libs']:
            par['dir_result'] = myIO.dir_os(
                os.path.abspath(par['out'] + '_' + lib)).create_dir()
            if os.path.isdir(par['dir_result']):
                #1: sample_info.csv
                par['file_sample_info'] = par['dir_result'] + 'sample_info.csv'
                print('The sample information file: ', par['file_sample_info'])
                #read sample_info.csv
                myParallel.samples(par).export_sample_info()
                #2: copy template variables.txt into lib folder
                template_file = '{}variables_{}.txt'.format(
                    par['dir_bin'], lib)
                var_file = '{}variables.txt'.format(par['dir_result'])
                print('Save {} and then update it.'.format(var_file))
                shutil.copy(template_file, var_file)
                #update parameters of variables.txt
                refresh = {
                    'dir_home': par['dir_home'],
def par_command(argv):
    """Parse the command line of the FASTQ pre-processing script.

    :param argv: full argument vector; ``argv[0]`` is the program name used
        in the usage text and the remaining items are the options.
    :return: parameter dictionary. Unset paths stay ``'NA'``; ``seq_end``
        stays ``None`` unless a fixed length (-r) was requested, in which
        case it is ``seq_start + seq_len``.

    Prints the usage text and exits with status 2 on an unknown option, on
    an option missing its value, and on -h/--help.
    """
    phip_libs = ['human', 'virus', 'allergome', 'provirome', 'toxome', 'mouse', 'PE', 'zika', 'arbo', 'LISH']
    #initiate parameter
    na_str = 'fq_file,barcode_file,index_file,I1_file,I2_file,dir_raw_data,dir_in,out,dir_result'
    par = dict.fromkeys(na_str.split(','), 'NA')
    par.update({'ref_libs': phip_libs[:2], 'seq_start': 0, 'seq_end': None, 'seq_min': 10})
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
                '-f <fastq file> -i <index file> -b <barcode file>\n'
    # getopt only hands a value to a long option whose name ends in '='.
    # The list holds the names tested by the handlers below plus the
    # spellings that were registered before (dir_raw_data, len_trim), now
    # routed to the matching handler. 'fixed_end5' never had a handler and
    # stays a value-less flag so existing invocations keep parsing.
    long_opts = ["help", "fastq_file=", "index_file=", "barcode_file=",
                 "raw_data=", "dir_raw_data=", "min_len=", "trim_5end=",
                 "fixed_len=", "len_trim=", "fixed_end5", "dir_in=", "out=",
                 "I1_file=", "I2_file=", "ref_library="]
    try:
        opts, args = getopt.getopt(argv[1:], "hf:i:b:o:t:r:l:x:y:m:n:c:",
                                   long_opts)
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)

    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage
            # python Process_FASTQ.py -f * -i * -b * -o * -y *"
            print("-h --help\tUsage information of this script.")
            print("-t --trim_5end\tNumber of nt trimmed from the 5'-end of reads (Optional)")
            print("-r --fixed_len\tNumber of nt kept after the 5'-end trim (Optional)")
            print("-l --min_len\tReads shorter than this are not assigned (Optional)")
            print("-f --fastq_file\tFastq file determined by a sequencing analyzer.")
            print("-i --index_file\tIndex file matched with the fastq file.")
            print("-b --barcode_file\tBarcode file matched with the index file.")
            print("-o --raw_data\tDirectory storing demulitplexed *fastq files.")
            print("-y --out\tDirectory storing sample_info.csv and variables.txt.")
            print("-c --ref_library\tReference libraries can be any of {}, default is {}.".format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
        elif opt in ("-o", "--raw_data", "--dir_raw_data"):
            par['dir_raw_data'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--min_len"):
            # discard shorter reads due to poor sequencing
            par['seq_min'] = abs(int(arg))
        elif opt in ("-t", "--trim_5end"):
            #trim_end5: length of nt from the 5-end
            par['seq_start'] = abs(int(arg))
        elif opt in ("-r", "--fixed_len", "--len_trim"):
            #len_trim: length of nt after trimming 5-end and 3-end
            par['seq_len'] = abs(int(arg))
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            #unknown library names are dropped silently
            libs = arg.split(',')
            par['ref_libs'] = [x for x in libs if x in phip_libs]
    #derive the slice end only after every option is read, so the result
    #does not depend on whether -r was given before or after -t
    if 'seq_len' in par:
        par['seq_end'] = par['seq_start'] + par['seq_len']
    #
    myDict.basic(par).print_dict()
    return par
        #download the idmapping file
        local_file = myDownload.uniprot(par['dir_out']).download_idmapping()


#################################################################################
if __name__ == "__main__":
    #initiate dictionary saving parameters
    par = {
        'in_out': 'Continue'
    }
    annot = download_annot(par)
    ########################################
    #

    #2: download dir
    par['dir_home'] = myIO.dir_os('/home/yuan/data_preparation/').stdin_dir(
        'Enter the directory path storing downloads files')
    print par['dir_home']
    while (par['in_out'] == 'Continue'):
        #2:select ftp or web site
        web_sites = ['NCBI', 'ENSEML', 'UniProt']
        par['web_site'] = mySystem.system().select_key(
            web_sites, 'Select public database')
        par['dir_out'] = par['dir_home'] + par['web_site'] + '/'
        #1: select file types
        if par['web_site'] in ['NCBI', 'ENSEML']:
            operations = ['Genome annotation', 'match fasta and gtf']
            par['operations'] = mySystem.system().select_key(
                operations, 'What is your operations')
        elif par['web_site'] == 'UniProt':
            par['operations'] = 'UniProt idmapping'