Example #1
def main(fragment_file, lead_file):
    fragment_mols = read_file(fragment_file)
    lead_mols = read_file(lead_file)
    fragment_mols += lead_mols

    logging.info("Read %s molecules for fragmentation library",
                 len(fragment_mols))
    logging.info("Read %s lead moleculs", len(lead_mols))

    fragments, used_mols = get_fragments(fragment_mols)
    logging.info("Num fragments: %s", len(fragments))
    logging.info("Total molecules used: %s", len(used_mols))
    assert len(fragments)
    assert len(used_mols)
    encodings, decodings = get_encodings(fragments)
    save_decodings(decodings)
    logging.info("Saved decodings")

    lead_mols = np.asarray(
        fragment_mols[-len(lead_mols):])[used_mols[-len(lead_mols):]]

    X = encode_list(lead_mols, encodings)

    logging.info("Building models")
    actor, critic = build_models(X.shape[1:])

    X = clean_good(X, decodings)

    logging.info("Training")
    history = train(X, actor, critic, decodings)
    logging.info("Saving")
    np.save("History/history.npy", history)
Example #2
 def get_instructors(self, path):
     """A function that assigns instructor's information (cwid, name, dept) to instructor's cwid."""
     try:
         for cwid, name, dept in read_file(path, 3, sep = '\t', header=False):
             self.instructors[cwid] = Instructor(cwid, name, dept)
     except ValueError as e:
         print(e)
Example #3
 def reading_students(self, path):
     """A function that assigns student's information (cwid, name, major) to his/her cwid."""
     try:
         for cwid, name, major in read_file(path, 3, '\t', header=False):
             self.students[cwid] = Student(cwid, name, major)
     except ValueError as e:
         print(e)
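# A minimal sketch of the tuple-yielding read_file(path, fields, sep, header)
# that the two methods above appear to rely on; the signature, field check,
# and encoding are assumptions, not the project's actual file_reader code.
def read_file(path, fields, sep='\t', header=False):
    """Yield one tuple of `fields` sep-separated values per line of `path`."""
    with open(path, 'r', encoding='utf-8') as fp:
        for line_no, line in enumerate(fp, start=1):
            if header and line_no == 1:
                continue  # skip the header row when one is expected
            parts = line.rstrip('\n').split(sep)
            if len(parts) != fields:
                raise ValueError("{}:{} has {} fields, expected {}".format(
                    path, line_no, len(parts), fields))
            yield tuple(parts)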
Example #4
def get_places():
	"""Extracts entities from text and returns lists and links to wikipedia"""

	text = file_reader.read_file(session['filepath'])
	target_lang = session['target_lang']
	source_lang = (session['source_lang']).encode('ascii', 'replace')


	#Uses the NER tagger to get entities
	if source_lang == 'de':
		nouns = german.pos(text)
		organizations, locations, people = german.ner(text)
	elif source_lang == 'es':
		nouns = spanish.pos(text)
		organizations, locations, people = spanish.ner(text)
	else:
		nouns = text_processing.nouns_only(text_processing.preprocess(text))
		organizations, locations, people = text_processing.ner_tagger(text)


	#Checks the type of entity that is being requested
	ent = request.form['ent']
	
	if ent == "places":

		if locations: 
			loclist = wikipedia.get_entity_info(locations, target_lang, source_lang)
			downfile = file_reader.write_csv_file(loclist)
			geocodes = geocoding.geocode(locations)
			return render_template("places.html", locations = loclist, geocodes = json.dumps(geocodes), downfile=downfile)
		else: 
			return render_template("places.html")

	elif ent == "organizations":

		if organizations:
			orglist = wikipedia.get_entity_info(organizations, target_lang, source_lang)
			downfile = file_reader.write_csv_file(orglist)
			return render_template("orgs.html", organizations = orglist, downfile=downfile)
		else:
			return render_template("orgs.html")

	elif ent == "people":

		if people:
			peoplelist = wikipedia.get_entity_info(people, target_lang, source_lang)
			downfile = file_reader.write_csv_file(peoplelist)
			return render_template("people.html", people = peoplelist, downfile=downfile)
		else:
			return render_template("people.html")

	elif ent == "nouns":
			if nouns: 	
				nounlist = wikipedia.get_entity_info(nouns, target_lang, source_lang)
				downfile = file_reader.write_csv_file(nounlist)
				return render_template("other.html", nouns = nounlist, downfile=downfile)
			else:
				return render_template("other.html")
Example #5
def main(fragment_file, lead_file):
    fragment_mols = read_file(fragment_file)
    lead_mols = read_file(lead_file)
    fragment_mols += lead_mols

    logging.info("Read %s molecules for fragmentation library",
                 len(fragment_mols))
    logging.info("Read %s lead molecules", len(lead_mols))

    fragments, used_mols = get_fragments(fragment_mols)
    logging.info("Num fragments: %s", len(fragments))
    logging.info("Total molecules used: %s", len(used_mols))
    assert len(fragments)
    assert len(used_mols)
    # =============================================================================
    #    encodings, decodings = get_encodings(fragments)
    #    save_encodings(encodings)
    #    save_decodings(decodings)
    #    logging.info("Saved encodings and decodings")
    # =============================================================================
    lead_mols = np.asarray(
        fragment_mols[-len(lead_mols):])[used_mols[-len(lead_mols):]]

    decodings = read_decodings()
    encodings = read_encodings()
    logging.info("Loaded encodings and decodings")

    X = encode_list(lead_mols, encodings)
    #print(X.shape)
    if X.shape[0] == 0:
        return -1
    logging.info("Building models")
    actor, critic = build_models(X.shape[1:])

    # X = clean_good(X, decodings)
    # logging.info("Remaining molecules after clean good: %s",X.shape[0])

    if X.shape[0] == 0:
        return -1
    logging.info("Training")
    history = train(X, actor, critic, decodings)
    logging.info("Saving")
    np.save("History/history.npy", history)
    actor.save('./saved_models/generation')
    critic.save('./saved_models/critic')
Example #6
def measure_spell_correction(path):
    # NOTE: the incoming path argument is overridden with a hardcoded location here
    path = "/Users/anant/pythonWD/data_corrected 2/"
    path += "spell_checking_task_v2/"

    all_folders_to_train = ["graphics", "medicine", "motorcycles", "religion", "space", "atheism", "autos"]
    count = 0
    error = 0
    confusion_set = spell_checker.get_confusion_set(path)
    for folder in all_folders_to_train:
        dev_results_path = path + folder + "/dev_results/"
        all_dev_op_files = file_reader.list_all_text_files(dev_results_path)
        for file in all_dev_op_files:
            train_file = file.replace("dev_results", "train_docs")
            devop_file_text = file_reader.read_file(file)
            train_file_text = file_reader.read_file(train_file)
            devop_list = nltk.word_tokenize(devop_file_text)
            train_list = nltk.word_tokenize(train_file_text)
            # keep only the tokens that appear in the confusion set
            devop_confusion_list = [w for w in devop_list if w in confusion_set]
            train_confusion_list = [w for w in train_list if w in confusion_set]
            if len(train_confusion_list) != len(devop_confusion_list):
                print("Number of confused words is not the same in train and corrected data; this shouldn't have happened\n")
            error_in_file = 0
            for j in range(len(train_confusion_list)):
                if train_confusion_list[j] != devop_confusion_list[j]:
                    error_in_file += 1
            if len(train_confusion_list) == 0:
                if len(devop_confusion_list) == 0:
                    print("No confusion words found in:", file, "\n")
            else:
                count += 1
                error += error_in_file / len(train_confusion_list)

        print("Mean error for the folder:", error / count, "\n")
Example #7
def main(fragment_file, lead_file):
    fragment_mols = read_file(fragment_file)
    lead_mols = read_file(lead_file)
    fragment_mols += lead_mols


    fragments, used_mols = get_fragments(fragment_mols)
    encodings, decodings = get_encodings(fragments)
    save_decodings(decodings)

    lead_mols = np.asarray(fragment_mols[-len(lead_mols):])[used_mols[-len(lead_mols):]]

    X = encode_list(lead_mols, encodings)

    actor, critic = build_models(X.shape[1:])

    X = clean_good(X, decodings)

    history = train(X, actor, critic, decodings)

    np.save("History/history.npy", history)
Example #8
def start_processing(data_file):
    data = fr.read_file(data_file)

    # Set the largest possible value for the weight
    max_weight = sys.maxsize
    optimum_path = None
    path_common = []

    path_common = pc.start_path_generator(data, path_common,
                                          cg.weight_calculator)

    while len(
            list(
                filter(
                    lambda path_dictionary_converted:
                    (path_dictionary_converted['alive'] == True) and
                    (len(path_dictionary_converted['path']) < len(data)),
                    path_common))) > 0:

        short_path_dictionary_converted = sorted(
            path_common, key=lambda path: path['weight'])[0]
        path_common.remove(short_path_dictionary_converted)

        removed_connections = [
            i for i in range(len(data))
            if short_path_dictionary_converted['path'].count(i) == 0
        ]

        for connection in removed_connections:
            path_common, max_weight, optimum_path = pu.path_inserter(
                data, path_common, connection, short_path_dictionary_converted,
                max_weight, optimum_path, cg.paths_calculator,
                cg.weight_calculator, ds.duplicate_searcher)

    maximum_solutions = math.factorial(len(data))
    unexplored_percentage = (maximum_solutions -
                             len(path_common)) / maximum_solutions * 100

    result = ""
    paths_string = ""

    for path in path_common:
        result = result + 'Path: {}, Weight: {}, C: {}'.format(
            [x + 1 for x in path['path']], path['weight'], path['C']) + "\n"

    # for x in optimum_path:
    result = result + '\nWeight of the best path: ' + str(max_weight)
    result = result + '\nBest path: ' + str([x + 1 for x in optimum_path])
    result = result + '\nPruned: ' + '{}%'.format(
        int(unexplored_percentage))

    return result
Example #9
 def get_grades(self, path):
     """A function that adds courses and grades to students and instructors."""
     try:
         for student_cwid, course, grade, instructor_cwid in read_file(path, 4, sep = '\t', header=False):
             if student_cwid in self.students:
                 self.students[student_cwid].add_course(course, grade)
             else:
                 print("unknown student")
             if instructor_cwid in self.instructors:
                 self.instructors[instructor_cwid].add_course(course)
             else:
                 print("instructor not found")
     except ValueError as e:
         print (e)
Example #10
def get_entities():
	"""Loads all entity names into a JSON for highlighting"""

	text = file_reader.read_file(session['filepath'])
	session['text'] = (text)
	source_lang = session['source_lang']

	if source_lang == 'de':
		fullset =text_processing.single_set(german.ner(text))
	elif source_lang == 'es':
		fullset =text_processing.single_set(spanish.ner(text))
	else:
		fullset = text_processing.single_set(text_processing.ner_tagger(text))

	return json.dumps(fullset)
Example #11
    def test_reading_file_succeeds(self):
        path_to_test_file = os.path.join(os.path.dirname(__file__), 'sample_1',
                                         'routes.geojson')
        expected_content = """{
"type": "FeatureCollection",
"name": "routes",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [

]
}
"""
        geojson = read_file(path_to_test_file)
        self.assertIsNotNone(geojson)
        self.assertEqual(geojson, expected_content)
Example #12
def do_spell_correction(path, dev_set=1):

    n_gram = 2

    path += "spell_checking_task_v2/"

    print("***Running spell correction at: ", path, " \n ***")
    confusion_set = get_confusion_set(path)
    all_folders_to_train = [
        "graphics", "medicine", "motorcycles", "religion", "space", "atheism",
        "autos"
    ]
    '''                   BUILD LANGUAGE MODELS FOR ALL THE FOLDERS             '''

    folder_index = 0

    probability_table = [dict() for x in range(7)]

    while (folder_index < len(all_folders_to_train)):
        print("\n Processing folder ", all_folders_to_train[folder_index])
        #read all text files
        all_text_files = []
        folder_path = path + all_folders_to_train[folder_index] + "/train_docs/"
        all_text_files += file_reader.list_all_text_files(folder_path)

        #preprocess each file
        all_preprocessed_text = ""
        for file in all_text_files:
            file_text = file_reader.read_file(file)
            all_preprocessed_text += " " + preprocessor.preprocess(file_text)
            sent_end_chars = [".", "!", "?"]

        token_dictionary = tokenizer.tokenize(all_preprocessed_text, n_gram,
                                              sent_end_chars, 1)

        probability_table[
            folder_index] = probability_calculator.generate_probability_table(
                token_dictionary, n_gram)
        folder_index += 1
    print("\n Done processing all the folders\n")
    print("\n Done processing all the folders\n")
    generate_fixed_files(all_folders_to_train, confusion_set,
                         probability_table, path, dev_set)
Example #13
def post_process(text):
  with open('data/temp.txt', 'w') as f:
    f.write(text)

  command = 'sh ../pos_tagger.sh data/temp.txt 2> data/temp_info.txt > data/temp_tagged.txt'
  subprocess.call(command, shell=True)

  pos_lines = file_reader.read_file_lines('data/temp_tagged.txt')
  pos_info_array = sanitize_pos(pos_lines)
  # print(pos_info_array)

  command = 'sh ../parser_dependencies.sh data/temp.txt 2> data/temp_info.txt > data/temp_dependencies.txt'
  subprocess.call(command, shell=True)

  dependencies_content = file_reader.read_file('data/temp_dependencies.txt')
  dep_info_array = sanitize_dep(dependencies_content)
  # print(dep_info_array)

  knowledge.generate(pos_info_array, dep_info_array)
Example #14
def main():
    try:
        user_option = int(
            input("1- Create new instance \n2- Read a file\nOPTION: "))
    except ValueError:
        # without a valid option there is nothing to do
        print("ERROR: USER INPUT")
        return

    if user_option == 1:
        instance_generator()
    elif user_option == 2:
        clusters_amount, capacity, customers, weights, distance_matrix = read_file(
        )
        solved_clusters_hwe, solved_clusters_weights_hwe, objective_function_hwe = heaviest_weight_edge(
            len(weights), clusters_amount, capacity, weights, distance_matrix)
        solved_clusters_gch, solved_clusters_weights_gch, objective_function_gch = greedy_construction_heuristic(
            len(weights), clusters_amount, capacity, weights, distance_matrix)
        local_search(solved_clusters_hwe, solved_clusters_weights_hwe,
                     distance_matrix, objective_function_hwe)
        local_search(solved_clusters_gch, solved_clusters_weights_gch,
                     distance_matrix, objective_function_gch)
Example #15
def learn_and_answer(input_file):
    info = read_file(input_file)
    error_msgs = info['error_msgs']
    
    if len(info['ref_words']) > 0:
        # init the robot with default answer
        robot = MerchantRobot(DEFAULT_ANSWER)

        # build the robot's ref words book
        result = robot.learn_knowledge(info['ref_words'], info['price_msgs'])
        if result:
            error_msgs.extend(result)

        # use the robot to answer questions
        result = robot.answer_questions(info['questions'])
        if result:
            print("\n".join(result))
        if error_msgs:
            print("\n".join(error_msgs))
    else:
        print("no ref words found")
Example #16
import os
from file_reader import read_file
from file_writer import write_output
from problem_solver import ProblemSolver

# Input

files = [
    'a_example.txt', 'b_lovely_landscapes.txt', 'c_memorable_moments.txt',
    'd_pet_pictures.txt', 'e_shiny_selfies.txt'
]

# End Input

example_file = os.path.join('data', files[0])
example = read_file(example_file)
exampleSlicer = ProblemSolver(example['header'], example['rows'],
                              os.path.join('output', 'example.txt'))
exampleSlicer.output_solution()

small_file = os.path.join('data', files[1])
small = read_file(small_file)
smallSlicer = ProblemSolver(small['header'], small['rows'],
                            os.path.join('output', 'small.txt'))
smallSlicer.output_solution()

medium_file = os.path.join('data', files[2])
medium = read_file(medium_file)
mediumSlicer = ProblemSolver(medium['header'], medium['rows'],
                             os.path.join('output', 'medium.txt'))
mediumSlicer.output_solution()
Example #17

def p_communication(p):
    '''
    communication : talk EXCLAMATION communication
                | talk EXCLAMATION
    '''


def p_talk(p):
    '''
    talk : SEND LB ID SEMICOLON ID SEMICOLON MESSAGE RB
    '''
    controller.send_message(p[3], p[5], p[7])


# Error rule for syntax errors
def p_error(p):
    if p:
        print("Syntax error in input at '%s', at line %s character %s." % (p.value, p.lineno, p.lexpos))


# Build the parser
parser = yacc.yacc()


if __name__ == '__main__':
    file_name = input("Enter name of file to execute: ")
    s = read_file("tests/%s.txt" % file_name)
    parser.parse(s)
Example #18
import argparse
import os

from optimizer_core import Optimizer

parser = argparse.ArgumentParser(description='Parallelize eBPF program')
parser.add_argument('-i',
                    '--input',
                    type=str,
                    required=True,
                    help='eBPF dump input file name')
parser.add_argument('-o',
                    '--output',
                    type=str,
                    help='parallelized bin file name')

args = parser.parse_args()
in_file = args.input

out_file = os.path.splitext(
    args.input)[0] + ".bin" if args.output is None else args.output

program_bin, program_str = read_file(in_file)

parallelizer = Optimizer(program_bin,
                         program_str,
                         filename=os.path.splitext(args.input)[0],
                         branch_all_lanes=False,
                         lane_forward_constraint=True)

parallelizer.optimize()

write_program_to_file(filename=out_file + ".out",
                      parallel_program=parallelizer.resource_table)
Example #19
import sys
import file_reader
import runner


def initialize_parking_lot(line):
    command, arg = file_reader.parse_line(line)
    return runner.create_parking_lot(command, arg)


if __name__ == '__main__':
    """
    python -m main ABSOLUTE_FILE_PATH or RELATIVE_FILE_PATH or 'tests/mocks/input.txt'
    """
    try:
        input_file = sys.argv[1]
    except (IndexError, NameError):
        print("Please pass a valid file to see how can I run Parking lot.")
        exit()

    with file_reader.read_file(input_file) as file:
        parking_lot = initialize_parking_lot(file.readline())
        for line in file:
            command, args = file_reader.parse_line(line)
            runner.execute_command(parking_lot, command, args)
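# In this example read_file is used as a context manager whose result supports
# readline() and line iteration. A minimal sketch consistent with that usage;
# an assumption, not the project's actual file_reader implementation:
def read_file(path):
    # open() already returns a context manager yielding a file object,
    # so simply delegating to it satisfies the usage above.
    return open(path, 'r')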
Example #20
def get_time_float(original_struct_time):  # signature inferred from the call below
    if original_struct_time.tm_zone == 'GMT':  # original_struct_time is UTC based
        return calendar.timegm(original_struct_time)
    else:  # original_struct_time is in local time zone
        return time.mktime(original_struct_time)


def calculate_struct_time(time_float, zone):
    if zone == 'GMT':
        return time.gmtime(time_float)
    else:  # calculate local time based on time_float
        return time.localtime(time_float)


if __name__ == '__main__':

    dict_list = file_reader.read_file('timestamps.json')
    print('dict_list = ', dict_list)

    for d in dict_list:
        original_struct_time = time.strptime(d['timestamp'],
                                             d['format_string'])
        time_float = get_time_float(original_struct_time)
        calculated_struct_time = calculate_struct_time(
            time_float, original_struct_time.tm_zone)
        calculated_timestamp = time.strftime(d['format_string'],
                                             calculated_struct_time)

        print('********************************************')
        print(f'Original timestamp:   {d["timestamp"]} ')
        print('Original struct_time:    ', original_struct_time,
              original_struct_time.tm_zone, original_struct_time.tm_gmtoff)
Example #21
# Define a rule so we can track line numbers
def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)


# Characters to ignore
t_ignore = ' \t'


# Error rule
def t_error(t):
    print("ERROR: Illegal character '%s', at position %s, %s." %
          (t.value[0], t.lineno, t.lexpos))
    t.lexer.skip(1)


# Build the lexer
lexer = lex.lex()

if __name__ == '__main__':
    # Read the input
    lexer.input(read_file("tests/test.txt"))

    while True:
        tok = lexer.token()
        if not tok:
            break
        print(tok)
Example #22
        folders_to_train = [folder]

n_gram = int(input("\n\nEnter n for n-gram: "))


# read all text files
all_text_files = []
for folder in folders_to_train:
    folder_path = path + "/classification task/" + folder + "/train_docs/"
    all_text_files += file_reader.list_all_text_files(folder_path)


# preprocess each file
all_preprocessed_text = ""
for file in all_text_files:
    file_text = file_reader.read_file(file)
    all_preprocessed_text += " " + preprocessor.preprocess(file_text)

sent_end_chars = [".", "?", "!"]

# Tokenize the corpus
# last argument in the function call below is for lowercasing
token_dictionary = tokenizer.tokenize(all_preprocessed_text, n_gram, sent_end_chars, 1)

# get cdf
cum_probability = probability_calculator.generate_cdf(token_dictionary, n_gram)

#
# Printing Cumulative probability table.
# Uncomment this block to print the Cumulative probability table.
#
Example #23
attribute_types = {
    "country": "categorical",
    "year": "numeric",
    "sex": "categorical",
    "age": "categorical",
    "suicides_no": "numeric",
    "population": "numeric",
    "suicides/100k pop": "numeric",
    "country-year": "categorical",
    "HDI for year": "numeric",
    "gdp_for_year ($)": "numeric",
    "gdp_per_capita ($)": "numeric",
    "generation": "categorical"
}

data = read_file("data.csv", attribute_types)

# step 1 and 2
print_numeric_attribute_info(data)

print('\n')

print_categorical_attribute_info(data)

# step 4
plot_histograms(data)

# step 5
for attribute in data:
    attribute.fill_missing_values()
Example #24
 def test_reading_nonexisting_file_fails(self):
     content = read_file("nonexisting")
     self.assertIsNone(content)
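# Together with test_reading_file_succeeds in Example #11, this test pins down
# a simple contract: read_file returns the file's text, or None when the path
# cannot be opened. A minimal sketch of that contract (an assumption, not the
# project's actual implementation):
def read_file(path):
    try:
        with open(path, 'r') as fp:
            return fp.read()
    except OSError:
        return None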
Example #25
 def get_file_content():
     file_path = request.args.get("filePath")
     return file_reader.read_file(file_path)
Example #26
import os
from file_reader import read_file
from file_writer import write_output
from problem_solver import ProblemSolver
from Slide import Slide
# Input

files = [
    'a_example.txt', 'b_lovely_landscapes.txt', 'c_memorable_moments.txt',
    'd_pet_pictures.txt', 'e_shiny_selfies.txt'
]

# End Input

example_file = os.path.join('data', files[0])
example = read_file(example_file)

print(example)
# will change to slide from their function
photos = example['rows']
slides = []  # they will pass this
for photo in photos:
    if photo['orientation'] == 'H':
        # print(photo['tags'])
        slides.append(Slide(photo))
Example #27
data_directory = '/media/ian/B2D61116D610DC831/8-7-18 Wavefronts/'
files = os.listdir(data_directory)

train_filelist, test_filelist = segment_files(files)
write_filelist('train_filelist.csv', train_filelist)
write_filelist('test_filelist.csv', test_filelist)

convlstm_model = model.get_model()
convlstm_model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001),
                       loss='mse')

num_epochs = 100

for epoch in range(num_epochs):
    print('Epoch {}'.format(epoch))
    for file in train_filelist:
        if (file.startswith('wavefront')):
            input_batch, output_batch = batch_data(
                file_reader.read_file(data_directory + file))
            convlstm_model.train_on_batch(input_batch, output_batch)
    average_error = 0
    for file in test_filelist:
        input_batch, output_batch = batch_data(
            file_reader.read_file(data_directory + file))
        error = convlstm_model.test_on_batch(input_batch, output_batch)
        average_error += error
    average_error = average_error / len(test_filelist)
    print('Average error: {}'.format(average_error))
    convlstm_model.save('convlstm_model2.h5')
Example #28
mean_speedup = 0
mean_unused_lanes = 0


def print_program(program):
    print()
    print("Program:")
    for inst in range(len(program)):
        print_unpkd(unpack_instruction(program[inst]), inst)
    print()


for file in files:
    print("====================================================")
    program_bin, program_str = read_file("xdp_prog_dump/" + file)

    print("File: " + file)
    print(" ~ with constraints")

    parallelizer = Optimizer(program_bin,
                             program_str,
                             filename=file,
                             branch_all_lanes=True,
                             lane_forward_constraint=True,
                             debug_print_blocks_pre_sched=True,
                             debug_print_blocks_pre_opt=False,
                             debug_draw_cfg=True)

    parallelizer.optimize()
Example #29
parser = argparse.ArgumentParser(
    description='Mercury. Sends GeoJSON or TCX files to Polaris server.')
parser.add_argument('--file', dest='file', help='The file to send')
parser.add_argument('--type',
                    dest='type',
                    help='Type of file to send (geojson/tcx)',
                    default='geojson')
parser.add_argument('--server',
                    dest='server_address',
                    help='URL of the Polaris server where to send the data',
                    default='http://127.0.0.1:3000/')
args = parser.parse_args()

if not args.file:
    parser.print_help()
    exit()

logging.debug("Mercury starting.")
data = read_file(args.file)
if data is not None:
    logging.debug("Sending " + args.file + " to Polaris server at " +
                  args.server_address)
    if args.type == 'geojson':
        endpoint = PolarisAPI.geojson.value
    if args.type == 'tcx':
        endpoint = PolarisAPI.tcx.value
    client = PolarisClient(args.server_address)
    client.send_data(data, endpoint)

logging.debug("Mercury finished.")
Example #30
def generate_fixed_files(all_folders_to_train,
                         confusion_set,
                         probability_table,
                         path,
                         dev_set=1):
    folder_index = 0
    while (folder_index < len(all_folders_to_train)):
        folder = all_folders_to_train[folder_index]
        probability_model = probability_table[folder_index]
        print("\n Processing folder ", folder)
        if (dev_set):
            bad_files_path = path + folder + "/train_modified_docs/"
            fixed_files_path = path + folder + "/dev_results/"
        else:
            bad_files_path = path + folder + "/test_modified_docs/"
            fixed_files_path = path + folder + "/test_docs/"

        #folder_path = path+all_folders_to_train[folder_index]+"/train_docs/";

        all_text_files = []
        all_text_files += file_reader.list_all_text_files(bad_files_path)
        #folder_path);
        #count = 0;
        for file in all_text_files:
            file_text = file_reader.read_file(file)
            file_text_list = nltk.word_tokenize(file_text)
            i = 0
            prev_word = ("<s>", )
            file_list_len = len(file_text_list)
            while i < file_list_len:
                current_word = file_text_list[i]
                if (prev_word in ['.', '!', "?"]):
                    prev_word = ("<s>", )

                if (current_word in confusion_set):
                    #print("testing for ", current_word, "\n");
                    current_word1 = confusion_set[current_word]
                    if (not (prev_word in probability_model)):
                        #    print("didn't find prev_word ", prev_word, " replacing it with <unk>\n");
                        prev_word = ("<unk>", )
                    #   prev_word = file_text_list[i];
                    #   i=i+1;
                    #   continue;
                    probability_row = probability_model[prev_word]
                    prob = 0
                    prob1 = 0
                    if (not (current_word in probability_row)):
                        if (current_word1 in probability_row):
                            prob1 = probability_row[current_word1]
                        else:
                            prob1 = 0
                            prob = 1
    #                    print("didn't find current_word ", current_word, " replacing it with <unk>\n");
                    else:
                        prob = probability_row[current_word]
                        if (current_word1 in probability_row):
                            prob1 = probability_row[current_word1]
                        else:
                            prob1 = 0

    #                    current_word = "<unk>";
    #                if( not(current_word1 in probability_row) ):
    #                   print("didn't find conf_word ", current_word1, " replacing it with <unk>\n");
    #                   current_word1 = "<unk>";

    #                prob1 = probability_row[ current_word];
    #                prob2 = probability_row[current_word1];
                    if (prob < prob1):
                        new_word = confusion_set[file_text_list[i]]
                        if (file_text_list[i].isupper()):
                            new_word = new_word.capitalize()
                    # print("Replaced ", file_text_list[i], " with ", new_word, "\n");
                        file_text_list[i] = new_word
                prev_word = (file_text_list[i], )
                i = i + 1
            corrected_file_text = " ".join(file_text_list)
            # create a file and write the corrected text
            corrected_file_name = file.split('/')[-1].replace("_modified", "")
            print(fixed_files_path + corrected_file_name)
            wfile = open(fixed_files_path + corrected_file_name, "w+")
            wfile.write(corrected_file_text)
            wfile.close()
            #count +=1;
            #if(count>10): break;
        folder_index += 1
Example #31
#coding=utf-8
'''
Created on 2016.10.13

@author: xiaoq
'''

import json
from file_reader import read_file
from user import *
from func import *

if __name__ == '__main__':
    records = read_file("data.txt")
    name2QQ_id = {}
    users = {}
    for record in records:
        # process each chat record in the loop
        # record is a dict with keys such as QQ_id, name, timestamp, and words
        QQ_id = record["QQ_id"]
        # create a new user, and keep updating the user's name
        if QQ_id not in users:
            newUser = User(QQ_id, record["name"])
            name2QQ_id[record["name"]] = QQ_id
            newUser.addRecord(record)
            users[QQ_id] = newUser
        else:
            users[QQ_id].name = record["name"]
            name2QQ_id[record["name"]] = QQ_id
            users[QQ_id].addRecord(record)
    # build the mapping between QQ id and nickname, and between index and QQ id
Example #32
 def test_read_file(self):
     for key, value in read_file('input.txt').items():
         self.assertEqual(value, self.info[key])
Example #33
        return formatted_slices

    def output_solution(self):
        num_slices = self.get_num_slices()
        formatted_slices = self.get_slices_formatted()
        output = num_slices + formatted_slices
        write_output(self.output_file, output)


example_file = os.path.join('data', 'a_example.in')
small_file = os.path.join('data', 'b_small.in')
medium_file = os.path.join('data', 'c_medium.in')
big_file = os.path.join('data', 'd_big.in')

example = read_file(example_file,
                    ['rows', 'cols', 'minIngredients', 'maxCellsPerSlice'],
                    ['row'])
small = read_file(small_file,
                  ['rows', 'cols', 'minIngredients', 'maxCellsPerSlice'],
                  ['row'])
medium = read_file(medium_file,
                   ['rows', 'cols', 'minIngredients', 'maxCellsPerSlice'],
                   ['row'])
big = read_file(big_file,
                ['rows', 'cols', 'minIngredients', 'maxCellsPerSlice'],
                ['row'])

exampleSlicer = PizzaSlicer(example['header'], example['rows'], 'example.out')
smallSlicer = PizzaSlicer(small['header'], small['rows'], 'small.out')
mediumSlicer = PizzaSlicer(medium['header'], medium['rows'], 'medium.out')
bigSlicer = PizzaSlicer(big['header'], big['rows'], 'big.out')