예제 #1
0
import os
import pandas as pd
from bs4 import BeautifulSoup
from Source.mtDNA.tibet.functions.file_system import get_path

# Input (HTML) and output (XLSX) directories, both resolved relative to the
# project root returned by get_path().
path_in = get_path() + '/Data/phylotree/html/'
path_out = get_path() + '/Data/phylotree/xlsx/'

# Accumulator for the parsed files: one 'tree_name' entry is appended per
# input file below. (The 'tree' list is not filled in the part of the script
# visible here - presumably it is populated further down; TODO confirm.)
phylotrees = {'tree_name': [], 'tree': []}
# Every directory entry in path_in is processed; there is no extension filter.
for curr_file in os.listdir(path_in):
    # NOTE(review): opened in text mode without an explicit encoding, so the
    # platform default encoding is used when decoding the HTML.
    with open(path_in + curr_file, "r") as f:
        contents = f.read()
        soup = BeautifulSoup(contents, 'lxml')

        # Only the first <table> in the document is considered.
        table = soup.find('table')
        table_rows = table.find_all('tr')
        # Convert each <tr> into a list holding the text of its <td> cells.
        table_rows_list = []
        for tr in table_rows:
            td = tr.find_all('td')
            row = [i.text for i in td]
            table_rows_list.append(row)

        # Keep only rows with more than one distinct cell value; this drops
        # rows without <td> cells and rows whose cells are all identical.
        table_rows_list = [row for row in table_rows_list if len(set(row)) > 1]
        # The tree name is whatever follows 'subtree ' in the second cell of
        # the first remaining row.
        # NOTE(review): str.find returns -1 when 'subtree' is absent, which
        # would silently yield an offset of 7 instead of raising.
        tree_name_index = table_rows_list[0][1].find('subtree') + len(
            'subtree ')
        tree_name = table_rows_list[0][1][tree_name_index:]
        phylotrees['tree_name'].append(tree_name)
        # Discard the first 16 remaining rows - presumably page header /
        # legend rows that precede the tree itself; TODO confirm.
        table_rows_list = table_rows_list[16:]
        table_rows_list = [[elem.replace('\xa0', ' ') for elem in table_row]
예제 #2
0
from Source.mtDNA.tibet.functions.file_system import get_path
from Source.mtDNA.high_altitude.functions import *
from Source.mtDNA.high_altitude.infrastructure_functions import *

# Flag set to 1 here; the code that reads it is outside this fragment.
read_tibet_data = 1

# All input and output locations are built from the project root returned by
# get_path().
path = get_path()
info_data_path = path + '/Data/alignment/info/'
tibet_data_path = path + '/Data/alignment/tibet/'
world_data_path = path + '/Data/alignment/world/'

# Create the result directories if they are missing. exist_ok=True replaces
# the separate exists() check, so there is no window between "check" and
# "create" in which another process could create the directory and make
# makedirs() fail.
tibet_result_path = path + '/Result/tibet_pair/'
os.makedirs(tibet_result_path, exist_ok=True)

world_result_path = path + '/Result/world_pair/'
os.makedirs(world_result_path, exist_ok=True)

# read_data() yields three values for the Tibetan alignment directory, which
# are unpacked into data, subjects and classes; region metadata is loaded
# separately from the info directory.
tibet_data, tibet_subjects, tibet_classes = read_data(tibet_data_path)
regions = get_region_info(info_data_path)

# Mapping from a group label to the class labels that belong to it.
# NOTE(review): the class labels look like altitude ranges - confirm the
# unit and the meaning of the open-ended '4001' label against the data files.
current_tibet_classes = {
    'Asian Low Altitude': [
        '0-500',
        '501-1000',
        '1001-1500',
        '1501-2000',
        '2001-2500',
        '2501-3000',
        '3001-4000',
    ],
    'Tibetan High Altitude': ['4001'],
}
tibet_subset, tibet_subject_classes = subset_subjects(tibet_data,
                                                      tibet_classes,
예제 #3
0
from Source.mtDNA.tibet.functions.file_system import get_path
import pandas as pd
import os

# Build a table with one row per FASTA record found in the data directory
# (subject ID, group token, and the name of the file the record came from)
# and export it to <data_path>/subjects.xlsx.
subject_info = {'subject': [], 'group': [], 'height': []}
data_path = get_path() + '/Data/'
for filename in os.listdir(data_path):
    if not filename.endswith('.fasta'):
        continue

    # The file name without its extension is stored as the 'height' value of
    # every record in that file (splitext replaces the hard-coded [:-6]).
    height = os.path.splitext(filename)[0]

    # The context manager closes the handle even if parsing below raises.
    with open(data_path + filename, 'r') as fasta_file:
        # Select header lines by their '>' marker instead of assuming that
        # headers sit on every second line; wrapped sequences or blank lines
        # would otherwise be misread as headers.
        header_lines = [
            line.rstrip() for line in fasta_file if line.startswith('>')
        ]

    for header in header_lines:
        fields = header.split(' ')
        # First token is '>' followed by the subject ID; drop the marker.
        subject = fields[0][1:]
        # The group is the token immediately before 'mitochondrion,'.
        # list.index raises ValueError if that token is missing.
        group = fields[fields.index('mitochondrion,') - 1]

        subject_info['subject'].append(subject)
        subject_info['group'].append(group)
        subject_info['height'].append(height)

df = pd.DataFrame(subject_info)
# ExcelWriter.save() was removed in pandas 2.0; leaving the with-block writes
# and closes the workbook on every pandas version.
with pd.ExcelWriter(data_path + 'subjects.xlsx',
                    engine='xlsxwriter') as writer:
    df.to_excel(writer, index=False, startrow=0)