Example #1
import multiprocessing

from joblib import Parallel, delayed


def main():

    # Keep only the '_complete.zip' archives
    files = [x for x in helper.get_files('.') if x.endswith('_complete.zip')]

    num_cores = multiprocessing.cpu_count()

    # One job per core; each call processes a single archive
    Parallel(n_jobs=num_cores)(delayed(process_file)(f) for f in files)
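Every example on this page relies on a project-local helper module that is never shown. A minimal get_files sketch, assuming it simply walks a directory tree and returns file paths; the optional suffix filter matches its two-argument use in Example #5 (the real implementation may differ):

import os

def get_files(path, extension=None):
    # Hypothetical reconstruction: recursively collect the paths of all
    # files under `path`, optionally keeping only a given filename suffix
    result = []
    for dirpath, _, filenames in os.walk(path):
        for name in filenames:
            if extension is None or name.endswith(extension):
                result.append(os.path.join(dirpath, name))
    return result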
Example #2
def main():

    files = helper.get_files('customer review data')

    for f in files:
        reviews = process_file(f)
        generate_data(f, reviews)
        break  # stops after the first file; the remaining files are never processed
Example #3
def process_dir(data_dir):
    """
    Processes a directory containing a set of case documents and generates the citation data.
    The citation data thus generated shall be stored in {data_dir}.txt.
    """

    case_files = helper.get_files(data_dir)
    ops = parse_files(case_files)

    # Write to file
    np.savetxt(data_dir + '.txt', ops, fmt='%s')
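With fmt='%s', np.savetxt writes the string form of each element on its own line, so ops (returned by parse_files, which is not shown) is presumably a flat sequence of citation strings. A tiny illustration with made-up values:

import numpy as np

# Writes one citation record per line to cases.txt
np.savetxt('cases.txt', ['case_1 cites case_2', 'case_3 cites case_1'], fmt='%s')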
Example #4
def process_dir(data_dir, b_size=None):
    """
    Processes a directory containing a set of case documents and extracts the case data.
    The case data thus extracted shall be stored in {data_dir}/txt/.
    """

    case_files = helper.get_files(data_dir)

    if b_size is not None:
        case_files = case_files[:b_size]

    target_path = os.path.join(data_dir, 'txt')

    write_files(parse_files(case_files), target_path)

    return target_path
Example #5
def main():
    # Arg handling
    if len(sys.argv) == 4:
        # Give TopK a default value
        TopK = 10
    elif len(sys.argv) == 5:
        TopK = sys.argv[4]  # soft clustering
    else:
        usage()

    pathToMfcc = os.path.abspath(
        sys.argv[1])  # root path under which .mfcc files are present
    kmeansCenters = os.path.abspath(sys.argv[2])  # kmeans centers
    outpath = os.path.abspath(sys.argv[3])  # dir for the output

    # Find out the number of kmeans centers by counting the number of rows
    with open(kmeansCenters) as centers:
        clusterCount = str(len(centers.readlines()))

    #get all the mfcc files
    print "Searching for " + os.environ["FORMAT"] + " files"
    mfccFiles = helper.get_files(pathToMfcc, os.environ["FORMAT"])

    count = 1
    name = os.path.join(outpath, "run_0.sh")
    script = open(name, "w")
    for mfcc in mfccFiles:
        if (count % SPLIT == 0):
            script.close()
            qsub(name)
            name = os.path.join(outpath, "run_" + str(count / SPLIT) + ".sh")
            script = open(name, "w")

        #create the new txyc file
        basename = os.path.basename(mfcc)
        txycFile = os.path.join(outpath, (basename.split("."))[0] + ".txyc")
        bofFile = os.path.join(outpath, (basename.split("."))[0] + ".bof")

        script.write(
            mfcc2txyc(mfcc, kmeansCenters, clusterCount, outpath, TopK) +
            ";\n")
        script.write(txyc2bof(txycFile, clusterCount, outpath, TopK) + ";\n")
        count += 1

    #complete the last script
    script.close()
    qsub(name)
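SPLIT, qsub, mfcc2txyc, and txyc2bof are defined elsewhere in this script. A minimal qsub sketch, assuming it does nothing more than hand the generated shell script to a PBS/Torque-style scheduler:

import subprocess

def qsub(script_path):
    # Hypothetical reconstruction: submit the batch script to the queue
    subprocess.call(["qsub", script_path])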
Example #6
def main():

    base_dir = 'Apex AD2600 Progressive-scan DVD player'
    output_dir = 'Apex AD2600 Progressive-scan DVD player_output'

    files = helper.get_files(base_dir)

    total_precision = 0
    total_recall = 0
    total_f1_score = 0
    count = 0
    total_accuracy = 0

    for f in files:

        if 'no_label_' not in f:
            continue

        true = f.replace('no_label_', '')
        empirical = os.path.join(output_dir, os.path.basename(f))

        precision, recall, f1, accuracy = score_file(empirical, true)

        if precision is not None:
            count += 1
            total_precision += precision
            total_recall += recall
            total_f1_score += f1
            total_accuracy += accuracy

            print(f, 'Precision:', precision, 'Recall:', recall, 'F1:', f1,
                  'Accuracy:', accuracy)

    # Macro-average the metrics across the scored files
    if count == 0:
        print('No labelled files were scored.')
        return

    pre = total_precision / count
    rec = total_recall / count
    f1s = total_f1_score / count
    acc = total_accuracy / count

    print('\n\nSummary:\n\n')
    print('Precision:', pre, 'Recall:', rec, 'F1:', f1s, 'Accuracy:', acc)
Example #7
else:  # the matching branch above (not shown in this excerpt) sets mode = 'dir'
    mode = 'file'

assert os.path.isdir(quartus_path)

# Thanks http://web.engr.oregonstate.edu/~sllu/tools/vhdl.html
vlib = quartus_path + '\\modelsim_ase\\win32aloem\\vlib.exe'
assert os.path.isfile(vlib)

vcom = quartus_path + '\\modelsim_ase\\win32aloem\\vcom.exe'
assert os.path.isfile(vcom)

quartus_project_dir = root + '\\quartus\\'

cmd = [vlib, 'work']
if run_cmd(quartus_project_dir, cmd) != 0:
    print('vlib failed')
    sys.exit(1)

if mode == 'file':
    cmd = [vcom, '-93', root + vhdl_src]
    if run_cmd(quartus_project_dir, cmd) != 0:
        print('vcom failed')
        sys.exit(1)

if mode == 'dir':
    for file in get_files(root + vhdl_src, '.vhd'):
        cmd = [vcom, '-93', '-check_synthesis', file]
        if run_cmd(quartus_project_dir, cmd) != 0:
            print('vcom failed')
            sys.exit(1)
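run_cmd and get_files are defined earlier in this script. A minimal run_cmd sketch, assuming it only needs to run the command with the given working directory and report the exit code:

import subprocess

def run_cmd(cwd, cmd):
    # Hypothetical reconstruction: run `cmd` in `cwd`, return the exit code
    return subprocess.call(cmd, cwd=cwd)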
Example #8
def is_valid_entry(points) -> bool:
    if np.isnan(points) or points == 0:
        return False
    return True


def get_season(filename: str) -> str:
    return "_".join(filename.split("_")[1:])


def get_team(filename: str) -> str:
    return filename.split("_")[0]


raw_files = get_files("raw_data")
columns = ["player", "team", "season", "date", "points"]

for file in list(raw_files):
    df = pd.read_csv(file)
    clean_df = pd.DataFrame(columns=columns)

    date_cols = df.columns[1:]
    for index, row in df.iterrows():
        for date_col in date_cols:
            if is_valid_entry(row[date_col]):
                clean_date = convert_date(date_col, int(file.stem[-4:]))
                clean_df = clean_df.append(
                    {
                        "player": row["player"],
                        "team": get_team(file.stem),
                        "season": get_season(file.stem),
                        "date": clean_date,
                        "points": row[date_col],
                    },
                    ignore_index=True,
                )
Example #9
def process_dir(data_dir, MIN_N_GRAM, MAX_N_GRAM, b_verbose=False, b_size=None):
    """
    Processes a directory containing a set of case documents and generates n-grams.
    The n-grams thus generated shall be stored in {data_dir}/n_grams/.
    """

    target_dir = os.path.join(data_dir, 'n_grams')

    # Make sure the target directory exists
    helper.ensure_dir(target_dir)

    # Get the case file list
    case_files = helper.get_files(data_dir)

    if b_size is not None:
        case_files = case_files[:b_size]

    total_count = len(case_files)
    progress = 0

    for case_file in case_files:

        # Compute the path to save the file
        target_file_name = os.path.basename(case_file)
        target_path = os.path.join(target_dir, target_file_name)

        # Read the case data from the string
        case_data = helper.read_file_to_string(case_file)

        valid_n_grams = {}

        # Go over every sentence in the document
        for sentence in get_sentences(case_data):

            pos_tuples = nltk.pos_tag(nltk.word_tokenize(sentence))

            # Update the grammar if required and get the POS tags
            pos_tags = get_pos_tags(pos_tuples)

            # Generate N-Grams of tags
            n_grams = []
            for n in range(MIN_N_GRAM, MAX_N_GRAM + 1):
                n_grams.extend([list(grams) for grams in ngrams(range(len(pos_tuples)), n)])

            # Get only the n-grams that match the defined grammar
            for i in range(len(n_grams)):

                # Generate n-gram list and check validity
                if parse([pos_tags[j] for j in n_grams[i]]):

                    # Append words to overall list
                    elements = ' '.join([pos_tuples[k][0] for k in n_grams[i]])

                    if elements in valid_n_grams:
                        valid_n_grams[elements] += 1
                    else:
                        valid_n_grams[elements] = 1

        # Save n-grams to file
        helper.save_dict_to_file(target_path, valid_n_grams)

        progress += 1

        if b_verbose:
            print(progress / (0.01 * total_count), ' % Complete')

    return target_dir
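The n-grams above are taken over token positions rather than tokens, so the same index tuple can address both pos_tags and the original words in pos_tuples. A quick look at what nltk's ngrams helper produces for a position sequence:

from nltk.util import ngrams

list(ngrams(range(4), 2))  # [(0, 1), (1, 2), (2, 3)]
list(ngrams(range(4), 3))  # [(0, 1, 2), (1, 2, 3)]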