Code example #1
File: college.py Project: 657143946/Graduate
def _load_college_map():
    # Parse "alias=standard_name" lines into a dict; maxsplit=1 tolerates
    # an "=" inside the value.
    ret = {}
    for line in utils.read_file(College.college_map_file):
        key, value = line.split("=", 1)
        assert value in STD_COLLEGE
        ret[key] = value
    return ret
Code example #2
def find_image_depth_matching(scene):
    pairs = uts.read_file(DATA_PATH + scene + '/id_img2depth.txt')
    id_img2depth = {}
    for pair in pairs:
        image_name, depth_name = pair.split(' ')
        id_img2depth[image_name] = depth_name
    return id_img2depth
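
For reference, find_image_depth_matching assumes each line of id_img2depth.txt pairs an image name with its depth-map name, separated by a single space. A hypothetical two-line file (placeholder names, not real SUN3D entries):

    img_0001.jpg depth_0001.png
    img_0002.jpg depth_0002.png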
Code example #3
import json
import os

import requests
from boilerpipe.extract import Extractor  # assumed provider of Extractor

# read_file, write_file and DIFFBOT_TOKEN are project-local helpers.


def run_process(q_recv, q_send, in_folder, out_folder, failed_extractions_file,
                max_tries, use_diffbot):
    """
    Tries 'max_tries' times to extract text using the Diffbot API or
    boilerpipe. At the end, if using Diffbot, tries one last time with
    boilerpipe.
    """

    texts, trec_ids = [], []

    def retrieve_texts_from_html(html, use_diffbot=False):
        """ Use the Diffbot API/Boilerpipe to retrieve texts from HTML """

        if use_diffbot:
            # Diffbot's Article API expects a url parameter even when the
            # HTML is POSTed directly, so a placeholder URL is supplied.
            dummy_url = 'https://www.diffbot.com/dev/analytics/'
            url_api = "https://api.diffbot.com/v3/article?token=%s" \
                      "&discussion=false&url=%s" % (DIFFBOT_TOKEN, dummy_url)
            headers = {'Content-type': 'text/html'}
            content = json.loads(
                requests.post(url_api, data=html, headers=headers).text)

            text = content["objects"][0]["text"]
            title = content["objects"][0]["title"]

            text = '\n'.join([title, text])
        else:
            text = Extractor(extractor='ArticleExtractor', html=html).getText()

        return text

    while True:
        trec_id = q_recv.get()

        # Check end condition
        if trec_id is None:
            break

        # Check if file exists
        if not os.path.isfile("%s/%s" % (in_folder, trec_id)):
            continue

        # Read HTML
        html = read_file("%s/%s" % (in_folder, trec_id), encoding='latin1')

        i = 0
        while i != max_tries:
            try:
                texts.append(
                    retrieve_texts_from_html(html, use_diffbot=use_diffbot))
                trec_ids.append(trec_id)
                break
            except Exception:  # Extraction failed; count the attempt and retry
                i += 1

        if i == max_tries:
            write_file("%s\n" % trec_id, failed_extractions_file, 'a')

    q_send.put((texts, trec_ids))
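
Since run_process reads ids from q_recv until it sees a None sentinel and pushes its results to q_send, it is presumably driven by a multiprocessing pool. A minimal sketch of such a driver, assuming hypothetical folder names and a trec_ids_to_process iterable (none of these names appear in the original project):

import multiprocessing as mp

q_recv, q_send = mp.Queue(), mp.Queue()
workers = [mp.Process(target=run_process,
                      args=(q_recv, q_send, 'html_in', 'text_out',
                            'failed.txt', 3, False))
           for _ in range(4)]
for w in workers:
    w.start()
for trec_id in trec_ids_to_process:  # assumed iterable of document ids
    q_recv.put(trec_id)
for _ in workers:
    q_recv.put(None)                 # one sentinel per worker
results = [q_send.get() for _ in workers]  # drain before join to avoid deadlock
for w in workers:
    w.join()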
Code example #4
def read_sample_dir(root_dir):
    try:
        file_path = root_dir + "/" + "sample_name_dir.txt"
        samples = utils.read_file(file_path, "r")
        return samples
    except Exception as e:
        print("Error occurred while reading the file : {} \n {}".format(
            file_path, e))
Code example #5
def process(q_recv, q_send, corpus_folder, remove_stopwords):
    articles = []
    while True:
        cwid = q_recv.get()

        # Check end condition
        if cwid is None:
            break

        # Article is already encoded in UTF-8
        article = read_file("%s/%s" % (corpus_folder, cwid))

        # PREPROCESS
        # Tokenize, lowercase and optionally strip stopwords
        articles += [preprocess_text(article, tokenize=True, all_lower=True,
                                     stopw=remove_stopwords).split()]

    # Send info back
    q_send.put(articles)
Code example #6
def main(argv):
    argv = FLAGS(argv)  # parse argv into FLAGS
    scene_names = uts.read_file(DATA_PATH + 'test_sun3d.txt')
    scene_names = scene_names[FLAGS.start:FLAGS.end]

    for scene_name in scene_names:
        scene_name += '/'
        print(FLOW_PATH + scene_name + 'flow/')
        # Download the scene data if it is not already present
        if not os.path.exists(DATA_PATH + scene_name + 'id_img2depth.txt'):
            print('Retrieve Data')
            os.system("/home/peng/SUN3DCppReader/src/build/SUN3DCppReader "
                      + scene_name + " " + DATA_PATH)
            if not os.path.exists(DATA_PATH + scene_name +
                                  'id_img2depth.txt'):
                continue

        if not os.path.exists(FLOW_PATH + scene_name + 'flow/'):
            uts.mkdir_if_need(FLOW_PATH + scene_name + 'flow/')
            id_img2depth = find_image_depth_matching(scene_name)
            gen_img_pair_data(scene_name, 500, id_img2depth)
Code example #7
# %%
sheet_name = "Holiday_Package.csv"
target_variable = 'Holliday_Package'
# Each row: [model, cross_validation folds, n_jobs, scoring]; shorter rows
# are padded with NaN by pandas.
modal = [['Decision_tree', 10], ['random_forest', 5], ['neural_network', 10],
         ['logistic_regression', 3], ['lda', 10], ['knn', 5],
         ['naive_bayes', 3, 2, 'f1'], ['bagging', 3, 2, 'f1'],
         ['ada_boost', 3, 2, 'f1'], ['gradient_boosting', 3, 2, 'f1'],
         ['support_vector_machine', 3]]
modals_data = pd.DataFrame(
    modal, columns=['Modal', 'cross_validation', 'n_jobs', 'scoring'])

# %%
print(modals_data)

# %%
df_holiday_package = utils.read_file(sheet_name)
display(df_holiday_package)

# %%
df_holiday_package = utils.drop_columns_by_colnums(df_holiday_package, [0])
display(df_holiday_package)

# %%
utils.info(df_holiday_package)

# %%
categorical_columns = utils.get_categorical_cols(df_holiday_package)
print(categorical_columns)

# %%
continuous_columns = utils.get_continous_cols(df_holiday_package)
Code example #8
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import pandas as pd
import numpy as np
import sys
import os
import traceback
from IPython.display import display
from utils import utils
from bivariate_analysis import bivariate_analysis

# %%
df_gems = utils.read_file("cubic_zirconia.csv")
display(df_gems)

# %%
df_gems = utils.drop_columns_by_colnums(df_gems, [0])
display(df_gems)

# %%
categorical_cols = utils.get_categorical_cols(df_gems)
display(categorical_cols)

# %%
continous_cols = utils.get_continous_cols(df_gems)
display(continous_cols)

# %%
utils.describe(df_gems, continous_cols)
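
These notebook cells lean on a project-local utils module whose source is not shown. A minimal sketch of what a few of the helpers used above might wrap in pandas (an assumption for illustration, not the project's actual code):

import pandas as pd

def read_file(sheet_name):
    # Plausibly a thin wrapper around pd.read_csv
    return pd.read_csv(sheet_name)

def drop_columns_by_colnums(df, colnums):
    # Drop columns by positional index
    return df.drop(df.columns[colnums], axis=1)

def get_categorical_cols(df):
    # Treat object/category dtypes as categorical
    return df.select_dtypes(include=['object', 'category']).columns.tolist()

def get_continous_cols(df):
    # Treat numeric dtypes as continuous
    return df.select_dtypes(include=['number']).columns.tolist()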
Code example #9
File: college.py Project: 657143946/Graduate
def _load_std_college():
    # One standard college name per line; a set gives fast membership checks.
    return set(utils.read_file(College.std_college_file))
Code example #10
import sys

from utils.sql_ops import Sqlite
from utils import utils

field = [
    "id", "name", "name_ruby", "department_1", "department_2", "likes",
    "dislikes", "like_foods", "dislike_foods", "introduction"
]

types = [
    "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT",
    "TEXT"
]

primary = "id"


def insert_chara(data):
    # Open the database, ensure the table exists, then insert the parsed rows.
    chara = Sqlite("starlightRe.db")
    chara.create("chara", field, types, primary)
    chara.insert("chara", data)
    chara.close()


if __name__ == "__main__":
    data = utils.read_file(sys.argv[1], field)
    print(data[0])
    insert_chara(data)
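
The Sqlite class comes from the project's utils.sql_ops module, which is not shown. A minimal sketch of what a wrapper with this create/insert/close surface might look like on top of the sqlite3 standard library (a hypothetical reconstruction, not the project's code):

import sqlite3

class Sqlite:
    def __init__(self, db_path):
        self.conn = sqlite3.connect(db_path)

    def create(self, table, fields, types, primary):
        # Builds e.g. "id INTEGER PRIMARY KEY, name TEXT, ..."
        cols = ", ".join(
            "%s %s%s" % (f, t, " PRIMARY KEY" if f == primary else "")
            for f, t in zip(fields, types))
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS %s (%s)" % (table, cols))

    def insert(self, table, rows):
        # rows: a list of tuples matching the field order
        placeholders = ", ".join("?" for _ in rows[0])
        self.conn.executemany(
            "INSERT INTO %s VALUES (%s)" % (table, placeholders), rows)
        self.conn.commit()

    def close(self):
        self.conn.close()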
Code example #11
# File path that contains the list of the samples to be analyzed.
if "No" in mv_input:
    print(
        "Please provide a path to the .txt file containing the list of the samples to be analyzed."
    )
    file_input = input()
    file_check = path.exists(file_input)

    while not file_check:
        print(
            "Please provide a valid path to the file containing the list of the samples to be analyzed."
        )
        file_input = input()
        file_check = path.exists(file_input)

    # The loop above only exits once the path exists.
    samples = utils.read_file(file_input, "r")

    if len(samples) > 0:
        for s in samples:
            run_bam_from_fastq.bam_from_fastq(rootDir, s)

haplotypecaller.run_haplotypecaller(rootDir)

combine_gvcfs.run_combine_gvcfs(rootDir)

genotype_gvcfs.run_genotype_vcfs(rootDir)

variant_calibration.run_variant_calibration_method(rootDir)
Code example #12
import os
import traceback

import pandas as pd
from IPython.display import display
from utils import utils
from modals import modals
from bivariate_analysis import bivariate_analysis

sheet_name = "insurance_claim.csv"
target_variable = 'Claimed'
# Each row: [model, cross_validation folds, n_jobs, scoring]; shorter rows
# are padded with NaN by pandas.
modal = [['Decision_tree', 10], ['Random_forest', 5, -1],
         ['Neural_Network', 10, -1], ['Logistic_Regression', 3, -1],
         ['LDA', 10, -1], ['KNN', 5, -1], ['Naive_Bayes', 3, -1, 'f1'],
         ['Bagging', 3, -1, 'f1'], ['Ada_Boost', 3, -1, 'f1'],
         ['Gradient_Boosting', 3, -1, 'f1'],
         ['Support_Vector_Machine', 3, -1]]
modals_data = pd.DataFrame(
    modal, columns=['Modal', 'cross_validation', 'n_jobs', 'scoring'])
display(modals_data)
display(modals_data.iloc[0])

df_insurance_claim = utils.read_file(sheet_name)
display(df_insurance_claim)

#df_insurance_claim = utils.drop_columns_by_colnums(df_insurance_claim, [0])
#display(df_insurance_claim)
utils.check_duplicate_data(df_insurance_claim)
categorical_columns = utils.get_categorical_cols(df_insurance_claim)
continuous_columns = utils.get_continous_cols(df_insurance_claim)
utils.convert_cat_into_code(df_insurance_claim, categorical_columns)

modals.check_imbalance(df_insurance_claim[target_variable])

X, Y = modals.extract_target_column(df_insurance_claim, target_variable)

X_train, X_test, Y_train, Y_test = modals.split_data_into_train_test(
    X, Y, 0.30, 1)