def init_wordprocessers(self):
    """Initialize word processors depending on the language."""
    if self.language.get() == 'English':
        self.stemmer = stem.snowball.EnglishStemmer()
        self.corr = correct('en')
        return 'en'
    else:
        self.stemmer = stem.snowball.GermanStemmer()
        self.corr = correct('de')
        return 'de'

def dispatch(values=None):
    # Validate the parameter
    if values is None:
        return {'error': 'parameter is missing'}
    if not isinstance(values, dict):
        return {'error': 'parameter is not a dictionary'}
    if 'error' in values:
        values.pop('error')
        return values
    if 'op' not in values or values['op'] == '':
        values['error'] = 'no op is specified'
        return values

    # Perform designated function
    if values['op'] == 'adjust':
        return adjust.adjust(values)
    elif values['op'] == 'predict':
        return predict.predict(values)
    elif values['op'] == 'correct':
        return correct.correct(values)
    elif values['op'] == 'locate':
        return locate.locate(values)
    else:
        values['error'] = 'op is not a legal operation'
        return values

def Spellcheck(self, event):
    # Spellcheck the word preceding the insertion point
    index = self.text.search(r'\s', "insert", backwards=True, regexp=True)
    if index == "":
        index = "1.0"
    else:
        index = self.text.index("%s+1c" % index)
    # Assign the last word typed to "word"
    word = self.text.get(index, "insert")
    # Previously: skip words already in the dictionary
    # if word.lower() in self._words:
    #     self.text.tag_remove("misspelled", index, "%s+%dc" % (index, len(word)))
    print(word)
    # Query the BK-tree for dictionary words within edit distance 2
    words = BK_TREE.BKTree.query(self._words, word.strip('.,;\"!:?\'()').lower(), 2)
    print(words)
    if len(words) == 0:
        # No candidate within distance 2: mark the word as misspelled
        self.text.tag_add("misspelled", index, "%s+%dc" % (index, len(word)))
    elif not (words[0][1] == 0):
        # The word is not in the dictionary: replace it with the most
        # reasonable candidate. correct() takes the word we typed and the
        # candidate array that query() returned.
        new_word = correct.correct(word, words)
        # Sequence of checks to add back surrounding punctuation
        if not word == '':
            start = word[0]
            end = word[-1]
            if start in ("\'", "(", "{", "[", "$", '\"', "*"):
                new_word = start + new_word
            if end in ("\'", ")", "]", ".", ",", "\"", ";", ":", "?", "!", "*"):
                new_word = new_word + end
        self.text.delete(index, "%s+%dc" % (index, len(word)))
        self.text.insert(index, new_word)

def mutate(solution: Solution):
    for path in solution.paths:
        if random.random() < MUTATION_PROBABILITY:
            if len(path.points) > 3:
                # Shift one interior point (and, half of the time, its
                # neighbour) by one step along a random axis and direction.
                change_point = random.randint(1, len(path.points) - 3)
                change_x = random.randint(0, 1)
                dir = 1 - 2 * random.randint(0, 1)
                path.points[change_point].move(dir * change_x, dir * (1 - change_x))
                if random.randint(0, 1) == 0:
                    path.points[change_point + 1].move(dir * change_x, dir * (1 - change_x))
            else:
                # Short path: split the first segment by inserting a new point.
                change = path.points[1] - path.points[0]
                if change.x == 0:
                    if abs(change.y) > 1:
                        change.x = 1 - 2 * random.randint(0, 1)
                        change.y = math.copysign(1, change.y) * random.randint(1, abs(change.y))
                elif change.y == 0:
                    if abs(change.x) > 1:
                        change.y = 1 - 2 * random.randint(0, 1)
                        change.x = math.copysign(1, change.x) * random.randint(1, abs(change.x))
                path.points.insert(1, path.points[0] + change)
            path.points = correct(path.points)
    return solution

def generate_path(board: Board, start: P2D, end: P2D) -> Path:
    path = Path(start, end)
    points = [start]
    # points.extend(generate_random_points(board, start, end))
    points.extend(generate_random_points(board))
    points.append(end)
    path.points = correct(points)
    return path

def get_text(img, classifier, bg_thresh=None, resize=None, min_char_dist=0,
             min_char_pixels=1, min_line_dist=0, min_line_pixels=1,
             spell_check=False, **kwargs):
    """
    Get the text of an image with a classifier.

    Args:
        get_text_from_regions() arguments:
            img
            classifier
            bg_thresh
            resize
            **kwargs
        min_line_dist (Optional[int]): Minimum distance between each line in
            line regions.
        min_line_pixels (Optional[int]): Minimum number of pixels along the
            height of an image to be considered as containing text.
            Defaults to 1.
        min_char_dist (Optional[int]): Minimum distance between each character
            in character regions. Defaults to 0.
        min_char_pixels (Optional[int]): Minimum number of pixels along a
            column in a line region to be considered as containing text.
            Defaults to 1.
        spell_check (Optional[bool]): Use spell check if True. Defaults to
            False.

    Returns:
        str: The string extracted from the image.
    """
    line_regs = line_regions(img, bg_thresh=bg_thresh, min_dist=min_line_dist,
                             min_pixels=min_line_pixels)
    char_regs = character_regions(img, line_regs, bg_thresh=bg_thresh,
                                  min_dist=min_char_dist,
                                  min_pixels=min_char_pixels)
    text = get_text_from_regions(img, line_regs, char_regs, classifier,
                                 resize=resize, **kwargs)
    if spell_check:
        text = " ".join(correct(word) for word in text.split())
    return text

def main():
    test = "agc would is going to famaly 2 3 verygood u know 实现 , ."
    tokens, flag_list = judge_word(test)
    for i, item in enumerate(tokens):
        if flag_list[i]:
            logging.info("{} is a word".format(item))
        else:
            logging.info("{} is not a word, may be {}".format(item, correct(item)))

def makeTag(self, field, value):
    fix = correct.correct()
    newval = str(value)
    # newval = html.unescape(newval)
    newval = newval.replace('&', 'and')
    newval = newval.replace('"', '')
    # newval = newval.replace('><', '')
    tag = dict()
    # logging.debug("OSM:makeTag(field=%r, value=%r)" % (field, newval))
    try:
        newtag = self.ctable.match(field)
    except Exception as inst:
        logging.warning("MISSING Field: %r, %r" % (field, newval))
        # If it's not in the conversion file, assume it maps directly
        # to an official OSM tag.
        newtag = field

    newval = self.ctable.attribute(newtag, newval)
    # logging.debug("ATTRS1: %r %r" % (newtag, newval))
    change = newval.split('=')
    if len(change) > 1:
        newtag = change[0]
        newval = change[1]

    # Name tags, usually roads or addresses, often have to be tweaked
    # for OSM standards.
    if (newtag == "name") or (newtag == "alt_name"):
        newval = string.capwords(fix.alphaNumeric(newval))
        newval = fix.abbreviation(newval)
        newval = fix.compass(newval)

    # This is a hack because the CO address data truncates the street,
    # and we need the whole thing so routing will work to an address.
    if newtag == 'addr:full':
        self.full = re.sub(" Unit .*", '', newval)
        newval = re.sub("^[0-9]* ", '', self.full)
        newtag = "add:street"
        # logging.debug("FIXME: FULL %" % self.full)
    elif newtag == 'addr:housenumber':
        # logging.debug("FIXME: NUM")
        self.num = newval
    elif newtag == 'addr:street':
        if self.full is not None:
            newval = re.sub("^[0-9]* ", '', self.full)
            # newval = self.full.replace(self.num, '')
            self.full = None
            self.addr = None

    tag[newtag] = newval
    # tag[newtag] = string.capwords(newval)
    # print("ATTRS2: %r %r" % (newtag, newval))
    return tag

def checker(ans, inp):
    res = False
    msg = ""
    corr = correct.correct(inp, "ru")
    if ans == corr:
        res = True
    if corr != inp:
        # "точнее" is Russian for "more precisely"
        msg = "(точнее: " + corr + ")"
    return {"result": res, "msg": msg}

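# Illustrative call of checker() above; the strings are made up and the exact
# behaviour of correct.correct(inp, "ru") is an assumption, not shown here.
outcome = checker("молоко", "малако")
# outcome["result"] is True only when the corrected input equals the answer;
# outcome["msg"] carries the "(точнее: ...)" hint whenever the input was changed.
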
def word_stem_stop_word(reply_text, num):
    stopwordsfile = open('stopwords.txt')
    stopwords = stopwordsfile.read().split('\r\n')
    nltk_word = nltk.word_tokenize(reply_text)
    nltk_word = nltk.pos_tag(nltk_word)
    reply = []
    proper_nouns = set()
    for word, tag in nltk_word:
        if str(word.lower()) not in stopwords:
            if tag == 'NNP' or tag == 'NNPS':
                proper_nouns.add(word.lower())
            else:
                word = correct.correct(word)
                word = porter.stem(word)
                word = correct.correct(word)
                reply.append(word.lower())
    # print reply
    # print proper_nouns
    reply = ' '.join(reply)
    if num:
        return reply, proper_nouns
    else:
        return reply

def correct_route():
    print(request.args)
    query = request.args.get('query')
    print("[Router] correct: {}".format(query))
    corrected = correct(query)
    if corrected is None:
        corrected = ''
    result = {
        'corrected': corrected,
    }
    print(result)
    return jsonify(result)

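# Hedged usage sketch for correct_route() above, exercised through Flask's
# test client. The app object and the '/correct' route path are assumptions
# for illustration; the original snippet does not show how the view is
# registered.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/correct', 'correct', correct_route)

with app.test_client() as client:
    # Returns a JSON body of the form {"corrected": "<suggestion or empty string>"}
    print(client.get('/correct?query=speling').get_json())
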
def dispatch(values=None):
    # Validate parm
    if values is None:
        return {'error': 'parameter is missing'}
    if not isinstance(values, dict):
        return {'error': 'parameter is not a dictionary'}
    if 'op' not in values:
        values['error'] = 'no op is specified'
        return values

    # Perform designated function
    if values['op'] == 'adjust':
        return adjust.adjust(values)
    elif values['op'] == 'predict':
        return predict.predict(values)    # This calculation is stubbed out
    elif values['op'] == 'correct':
        return correct.correct(values)    # This calculation is stubbed out
    elif values['op'] == 'locate':
        return values                     # This calculation is stubbed out
    else:
        values['error'] = 'op is not a legal operation'
        return values

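# Hedged usage sketch for dispatch() above. The first two calls exercise only
# the validation paths, which are fully determined by the code shown; the
# 'observation' key in the last call is hypothetical and would be consumed by
# adjust.adjust(), not by dispatch() itself.
print(dispatch())                # {'error': 'parameter is missing'}
print(dispatch({'op': 'swim'}))  # adds 'error': 'op is not a legal operation'
print(dispatch({'op': 'adjust', 'observation': '30d1.5'}))  # delegates to adjust.adjust()
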
def searchByCourse(request):
    course = request.GET.get("course", "")
    if course == "":
        return HttpResponse("Request error")
    print(type(course))
    course = correct.correct(course)
    print(type(course))
    # Check whether there is cached data for this course in redis
    xml = cache.get("C_" + course)
    if xml:
        return HttpResponse(xml, content_type="application/xml")
    try:
        # Check whether this course is already in the database
        c = Courses.objects.get(cname=course)
        # Return the historic result
        xml = getExistCourseRecord(c)
        cache.set("C_" + course, xml, 60 * 60 * 24)
        return HttpResponse(xml, content_type="application/xml")
    except Courses.DoesNotExist:
        # This course has not been searched before:
        # search it and store the result in the database
        cr = CourseReptile()
        t1 = time.time()
        booksNames = cr.course_search(course)
        print("search books by course cost : " + repr(time.time() - t1) + "s")
        # If there is no correlated book for this course
        if not len(booksNames):
            return HttpResponse("No relative book for this course!")
        c = Courses.objects.create(cname=course, description="")
        c.save()
        # Score each book name by its similarity to the course name
        similarNames = []
        for bookName in booksNames:
            p = Levenshtein.ratio(bookName, course)
            similarNames.append((bookName, p))
        # Sort the book names by similarity
        booksNames = sorted(similarNames, key=lambda x: x[1], reverse=True)
        print(booksNames)
        xml = ""
        for bookName, p in booksNames:
            # To be implemented: this operation should return a list of dictionaries
            sola = solaSpider()
            t1 = time.time()
            books = sola.getBookList(bookName, True)
            print("search books by book cost : " + repr(time.time() - t1) + "s")
            # Database operations for each book
            for book in books:
                print(book)
                # book is a dictionary
                bookid = storeBookItem(book)
                # Construct the return xml
                xml += getBookItemXml(bookid)
                # Create the relation between this new course and the relative book
                r = Relation.objects.create(course=c, bid=bookid, click=0)
                r.save()
        if xml == "":
            return HttpResponse("No relative book for this course!")
        xml = packXml(xml, c.id, "course")
        # Write to cache
        cache.set("C_" + course, xml, 60 * 60 * 24)
        return HttpResponse(xml, content_type="application/xml")

# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#

import logging
import html
import string
import pdb
import re
import correct
import inspect
import dejagnu

dj = dejagnu.dejagnu()
obj = correct.correct()
dj.verbose_level(2)

# Test the compass corrections, i.e. 'N RoadName' becomes 'North RoadName'
instr = "N 126"
x = obj.compass(instr)
dj.matches(x, 'North 126', "correct.compass(North)")

instr = "S 126"
x = obj.compass(instr)
dj.matches(x, 'South 126', "correct.compass(South)")

instr = "E 126"
x = obj.compass(instr)
dj.matches(x, 'East 126', "correct.compass(East)")

def to_graph(son):
    """From flat json to python graph"""
    def dumps(item):
        return json.dumps(item, indent=4)

    def printjson(item):
        print(dumps(item))

    categories = json.loads(son)
    categories = [correct(fixrelations(i)) for i in categories]
    independents = filter(
        lambda x: x[strings.based] == strings.independend, categories
    )

    def listToDict(keyFunction, values):
        return dict((keyFunction(v), v) for v in values)

    # A list is just a great way to waste time for this usecase
    independents = listToDict(lambda x: x["Name"], independents)

    def addChildTo(parent, child):
        parent[strings.children].append(child)
        return parent

    # Recursively find the parents. True on success, False on failure.
    # If successful, the child will be added to the found parents.
    def findparents(child, bases, parents):
        if len(bases) == 0:
            for p in parents:
                addChildTo(p, child)
            return True
        current = bases[0]
        if len(parents) == 0:
            try:
                parents.insert(0, independents[current])
            except KeyError as e:
                printjson(child)
                raise e
            return findparents(child, bases[1:], parents)
        base = next(
            (x for x in parents[0][strings.children] if x[strings.name] == current),
            None
        )
        if base is None:
            if current not in independents:
                if current == child[strings.name]:
                    # ubuntu dependson ubuntu... yes distrowatch, that's just silly
                    return findparents(child, [], parents)
                # The base is not added yet to the structure;
                # let's just ignore this one for now.
                return False
            parents.insert(0, independents[current])
            return findparents(child, bases[1:], parents)
        parents[0] = base
        return findparents(child, bases[1:], parents)

    def deepen(collection):
        counter = 0
        while len(collection) > 0:
            current = collection[0]
            basedstr = current[strings.based]
            bases = basedstr.split(",")
            if not findparents(current, bases, []):
                counter += 1
                if counter > len(collection) * 10:
                    printjson(list(collection))
                    raise Exception(
                        "Made five full circles in the deque, the data is "
                        + "just invalid, deque size %i " % counter
                    )
                collection.append(current)
            else:
                counter = 0
            collection.popleft()
        return collection

    from collections import deque
    notindependents = deque(filter(
        lambda x: not x[strings.based] == strings.independend, categories
    ))
    deepen(notindependents)
    return dumps(list(map(lambda item: item[1], independents.items())))

def process_image(filename):
    shutil.copy(filename, config.tmpDir)
    correct.correct(filename)

def generate_midi(config_folder_fd, config_folder_bd, config_folder_corr,
                  score_source, save_folder, initialization_type,
                  number_of_version, duration_gen, num_pass_correct,
                  logger_generate):
    """This function generates the orchestration of a midi piano score

    Parameters
    ----------
    config_folder : str
        Absolute path to the configuration folder, i.e. the folder containing
        the saved model and the results
    score_source : str
        Either a path to a folder containing two midi files (piano and
        orchestration) or the path to a piano midi file
    number_of_version : int
        Number of versions generated in a batch manner. Since the generation
        process involves sampling, it might be interesting to generate
        several versions
    duration_gen : int
        Length of the generated score (in number of events). Useful for
        generating only the beginning of the piece.
    logger_generate : logger
        Instantiation of logging. Can be None
    """
    logger_generate.info("#############################################")
    logger_generate.info("Orchestrating : " + score_source)

    # Load parameters
    parameters = pkl.load(
        open(config_folder_fd + '/script_parameters.pkl', 'rb'))
    model_parameters_fd = pkl.load(
        open(config_folder_fd + '/model_params.pkl', 'rb'))
    #
    parameters_bd = pkl.load(
        open(config_folder_bd + '/script_parameters.pkl', 'rb'))
    model_parameters_bd = pkl.load(
        open(config_folder_bd + '/model_params.pkl', 'rb'))
    #
    parameters_corr = pkl.load(
        open(config_folder_corr + '/script_parameters.pkl', 'rb'))
    model_parameters_corr = pkl.load(
        open(config_folder_corr + '/model_params.pkl', 'rb'))

    assert (model_parameters_fd["temporal_order"] == model_parameters_bd["temporal_order"]) and (
        model_parameters_fd["temporal_order"] == model_parameters_corr["temporal_order"]
    ), "The models have different seed_size"
    assert (parameters["quantization"] == parameters_bd["quantization"]) and (
        parameters["quantization"] == parameters_corr["quantization"]
    ), "The models have different quantization"
    assert (parameters["temporal_granularity"] == parameters_bd["temporal_granularity"]) and (
        parameters["temporal_granularity"] == parameters_corr["temporal_granularity"]
    ), "The models have different temporal_granularity"
    assert (parameters["instru_mapping"] == parameters_bd["instru_mapping"]) and (
        parameters["instru_mapping"] == parameters_corr["instru_mapping"]
    ), "The models have different instru_mapping"
    assert (parameters["normalizer"] == parameters_bd["normalizer"]) and (
        parameters["normalizer"] == parameters_corr["normalizer"]
    ), "The models have different normalizer"

    seed_size = max(model_parameters_fd['temporal_order'], 10) - 1

    #######################
    # Load data
    if re.search(r'mid$', score_source):
        pr_piano, event_piano, duration_piano, name_piano, pr_orch, instru_orch, duration = generation_utils.load_solo(
            score_source, parameters["quantization"],
            parameters["binarize_piano"], parameters["temporal_granularity"])
    else:
        if initialization_type == "seed":
            pr_piano, event_piano, duration_piano, name_piano, pr_orch, instru_orch, duration = generation_utils.load_from_pair(
                score_source, parameters["quantization"],
                parameters["binarize_piano"], parameters["binarize_orch"],
                parameters["temporal_granularity"], align_bool=True)
        else:
            pr_piano, event_piano, duration_piano, name_piano, pr_orch, instru_orch, duration = generation_utils.load_from_pair(
                score_source, parameters["quantization"],
                parameters["binarize_piano"], parameters["binarize_orch"],
                parameters["temporal_granularity"], align_bool=False)

    if (duration is None) or (duration < duration_gen):
        logger_generate.info("Track too short to be used")
        return
    ########################

    ########################
    # Shorten
    # Keep only the beginning of the pieces (let's say a 100 events)
    pr_piano = pianoroll_processing.extract_pianoroll_part(
        pr_piano, 0, duration_gen)
    if parameters["duration_piano"]:
        duration_piano = np.asarray(duration_piano[:duration_gen])
    else:
        duration_piano = None
    if parameters["temporal_granularity"] == "event_level":
        event_piano = event_piano[:duration_gen]
    pr_orch = pianoroll_processing.extract_pianoroll_part(
        pr_orch, 0, duration_gen)
    ########################

    ########################
    # Instantiate piano pianoroll
    N_piano = parameters["instru_mapping"]['Piano']['index_max']
    pr_piano_gen = np.zeros((duration_gen, N_piano), dtype=np.float32)
    pr_piano_gen = build_data_aux.cast_small_pr_into_big_pr(
        pr_piano, {}, 0, duration_gen, parameters["instru_mapping"],
        pr_piano_gen)
    pr_piano_gen_flat = pr_piano_gen.sum(axis=1)
    silence_piano = [
        e for e in range(duration_gen) if pr_piano_gen_flat[e] == 0
    ]
    ########################

    ########################
    # Initialize orchestra pianoroll with orchestra seed (choose one)
    N_orchestra = parameters['N_orchestra']
    pr_orchestra_truth = np.zeros((duration_gen, N_orchestra),
                                  dtype=np.float32)
    pr_orchestra_truth = build_data_aux.cast_small_pr_into_big_pr(
        pr_orch, instru_orch, 0, duration_gen, parameters["instru_mapping"],
        pr_orchestra_truth)
    if initialization_type == "seed":
        pr_orchestra_seed = generation_utils.init_with_seed(
            pr_orch, number_of_version, seed_size, N_orchestra, instru_orch,
            parameters["instru_mapping"])
    elif initialization_type == "zeros":
        pr_orchestra_seed = generation_utils.init_with_zeros(
            number_of_version, seed_size, N_orchestra)
    elif initialization_type == "constant":
        const_value = 0.1
        pr_orchestra_seed = generation_utils.init_with_constant(
            number_of_version, seed_size, N_orchestra, const_value)
    elif initialization_type == "random":
        proba_activation = 0.01
        pr_orchestra_seed = generation_utils.init_with_random(
            number_of_version, seed_size, N_orchestra, proba_activation)
    ########################

    #######################################
    # Embed piano
    time_embedding = time.time()
    if parameters['embedded_piano']:
        # Load model
        embedding_path = parameters["embedding_path"]
        embedding_model = torch.load(embedding_path, map_location="cpu")
        # Build embedding (no need to batch here, len(pr_piano_gen) is
        # sufficiently small). No CUDA here because: afraid of mixing with TF
        # + possibly very long piano chunks
        piano_resize_emb = np.zeros(
            (len(pr_piano_gen), 1, 128))  # Embeddings accept size-128 samples
        piano_resize_emb[:, 0, parameters["instru_mapping"]['Piano']
                         ['pitch_min']:parameters["instru_mapping"]['Piano']
                         ['pitch_max']] = pr_piano_gen
        piano_resize_emb_TT = torch.tensor(piano_resize_emb)
        piano_embedded_TT = embedding_model(piano_resize_emb_TT.float(), 0)
        pr_piano_gen_embedded = piano_embedded_TT.numpy()
    else:
        pr_piano_gen_embedded = pr_piano_gen
    time_embedding = time.time() - time_embedding
    #######################################

    ########################
    # Inputs' normalization
    normalizer = pkl.load(
        open(os.path.join(config_folder_fd, 'normalizer.pkl'), 'rb'))
    if parameters["embedded_piano"]:
        # When using embedding, no normalization
        pr_piano_gen_norm = pr_piano_gen_embedded
    else:
        pr_piano_gen_norm = normalizer.transform(pr_piano_gen_embedded)
    ########################

    ########################
    # Store folder
    string = re.split(r'/', name_piano)[-1]
    name_track = re.sub('piano_solo.mid', '', string)
    generated_folder = save_folder + '/fd_bd_corr_' + initialization_type + '_init/' + name_track
    if not os.path.isdir(generated_folder):
        os.makedirs(generated_folder)
    ########################

    ########################
    # Get trainers
    with open(os.path.join(config_folder_fd, 'which_trainer'), 'r') as ff:
        which_trainer_fd = ff.read()
    # Trainer
    trainer_fd = import_trainer(which_trainer_fd, model_parameters_fd,
                                parameters)
    #
    with open(os.path.join(config_folder_bd, 'which_trainer'), 'r') as ff:
        which_trainer_bd = ff.read()
    # Trainer
    trainer_bd = import_trainer(which_trainer_bd, model_parameters_bd,
                                parameters)
    #
    with open(os.path.join(config_folder_corr, 'which_trainer'), 'r') as ff:
        which_trainer_corr = ff.read()
    # Trainer
    trainer_corr = import_trainer(which_trainer_corr, model_parameters_corr,
                                  parameters)
    ########################

    ############################################################
    # Generate
    ############################################################
    time_generate_0 = time.time()
    model_path = 'model_accuracy'

    # Forward
    pr_orchestra_gen = generate(trainer_fd, pr_piano_gen_norm, silence_piano,
                                duration_piano, config_folder_fd, model_path,
                                pr_orchestra_seed,
                                batch_size=number_of_version)
    prefix_name = 'fd_'
    generation_utils.reconstruct_generation(pr_orchestra_gen, event_piano,
                                            generated_folder, prefix_name,
                                            parameters, seed_size)

    # Backward
    pr_orchestra_seed = pr_orchestra_gen[:, -seed_size:]
    pr_orchestra_gen = generate_backward(trainer_bd, pr_piano_gen_norm,
                                         silence_piano, duration_piano,
                                         config_folder_bd, model_path,
                                         pr_orchestra_seed,
                                         batch_size=number_of_version)
    prefix_name = 'bd_'
    generation_utils.reconstruct_generation(pr_orchestra_gen, event_piano,
                                            generated_folder, prefix_name,
                                            parameters, seed_size)

    # Correction
    for pass_index in range(num_pass_correct):
        pr_orchestra_gen = correct(trainer_corr, pr_piano_gen_norm,
                                   silence_piano, duration_piano,
                                   config_folder_corr, model_path,
                                   pr_orchestra_gen,
                                   batch_size=number_of_version)
        generation_utils.reconstruct_generation(pr_orchestra_gen, event_piano,
                                                generated_folder, prefix_name,
                                                parameters, seed_size)
        prefix_name = 'corr_' + str(pass_index) + '_'

    time_generate_1 = time.time()
    logger_generate.info(
        'TTT : Generating data took {} seconds'.format(time_generate_1 -
                                                       time_generate_0))

    ############################################################
    # Reconstruct and write
    ############################################################
    prefix_name = 'final_'
    generation_utils.reconstruct_generation(pr_orchestra_gen, event_piano,
                                            generated_folder, prefix_name,
                                            parameters, seed_size)
    generation_utils.reconstruct_original(pr_piano_gen, pr_orchestra_truth,
                                          event_piano, generated_folder,
                                          parameters)
    return

def test_correct_simple():
    assert correct("L0ND0N") == "LONDON"
    assert correct("DUBL1N") == "DUBLIN"
    assert correct("51NGAP0RE") == "SINGAPORE"
    assert correct("BUDAPE5T") == "BUDAPEST"
    assert correct("PAR15") == "PARIS"

        return self.db.query(addr)


# epdb.set_trace()
dd = config(argv)
fcall = plotcalls(dd)
fcall.connect()

kml = kmlfile()
kml.open(dd.get('outfile'))
kml.header("TLFPD Calls")

calldata = open(dd.get('infile'), 'r')
lines = calldata.readlines()
fix = correct.correct()
for line in lines:
    if line[1] == '#':
        continue
    index = line.find(' ')
    number = line[:index]
    street = line[index + 2:]
    street = street.replace("\n", '')
    street = street.strip()
    street = fix.alphaNumeric(street)
    street = fix.abbreviation(street)
    street = fix.compass(street)
    query = "SELECT ST_AsKML(way) from planet_osm_point"
    query += " WHERE \"addr:housenumber\"='" + number + "'"
    query += " AND tags->'addr:street'='" + street + "';"

import nltk
from nltk.tokenize import RegexpTokenizer

from correct import correct

new_file = open("new_data.txt", "w")
with open("mail_data2.txt", "r") as f:
    for line in f:
        tokenizer = RegexpTokenizer('[A-Za-z.,?!]{1,}')
        a = tokenizer.tokenize(line)
        b = nltk.pos_tag(a)
        for word, tag in b:
            if tag == "NNP":
                new_file.write(word + " ")
            else:
                word = correct(word.lower())
                new_file.write(word + " ")
        new_file.write("\n")
new_file.close()

def test_correct_more():
    cases = [
        ("1F-RUDYARD K1PL1NG",
         "IF-RUDYARD KIPLING"),
        ("R0BERT MERLE - THE DAY 0F THE D0LPH1N",
         "ROBERT MERLE - THE DAY OF THE DOLPHIN"),
        ("R1CHARD P. FEYNMAN - THE FEYNMAN LECTURE5 0N PHY51C5",
         "RICHARD P. FEYNMAN - THE FEYNMAN LECTURES ON PHYSICS"),
        ("R1CHARD P. FEYNMAN - 5TAT15T1CAL MECHAN1C5",
         "RICHARD P. FEYNMAN - STATISTICAL MECHANICS"),
        ("5TEPHEN HAWK1NG - A BR1EF H15T0RY 0F T1ME",
         "STEPHEN HAWKING - A BRIEF HISTORY OF TIME"),
        ("5TEPHEN HAWK1NG - THE UN1VER5E 1N A NUT5HELL",
         "STEPHEN HAWKING - THE UNIVERSE IN A NUTSHELL"),
        ("ERNE5T HEM1NGWAY - A FARWELL T0 ARM5",
         "ERNEST HEMINGWAY - A FARWELL TO ARMS"),
        ("ERNE5T HEM1NGWAY - F0R WH0M THE BELL T0LL5",
         "ERNEST HEMINGWAY - FOR WHOM THE BELL TOLLS"),
        ("ERNE5T HEM1NGWAY - THE 0LD MAN AND THE 5EA",
         "ERNEST HEMINGWAY - THE OLD MAN AND THE SEA"),
        ("J. R. R. T0LK1EN - THE L0RD 0F THE R1NG5",
         "J. R. R. TOLKIEN - THE LORD OF THE RINGS"),
        ("J. D. 5AL1NGER - THE CATCHER 1N THE RYE",
         "J. D. SALINGER - THE CATCHER IN THE RYE"),
        ("J. K. R0WL1NG - HARRY P0TTER AND THE PH1L050PHER'5 5T0NE",
         "J. K. ROWLING - HARRY POTTER AND THE PHILOSOPHER'S STONE"),
        ("J. K. R0WL1NG - HARRY P0TTER AND THE CHAMBER 0F 5ECRET5",
         "J. K. ROWLING - HARRY POTTER AND THE CHAMBER OF SECRETS"),
        ("J. K. R0WL1NG - HARRY P0TTER AND THE PR150NER 0F Azkaban",
         "J. K. ROWLING - HARRY POTTER AND THE PRISONER OF Azkaban"),
        ("J. K. R0WL1NG - HARRY P0TTER AND THE G0BLET 0F F1RE",
         "J. K. ROWLING - HARRY POTTER AND THE GOBLET OF FIRE"),
        ("J. K. R0WL1NG - HARRY P0TTER AND THE 0RDER 0F PH0EN1X",
         "J. K. ROWLING - HARRY POTTER AND THE ORDER OF PHOENIX"),
        ("J. K. R0WL1NG - HARRY P0TTER AND THE HALF-BL00D PR1NCE",
         "J. K. ROWLING - HARRY POTTER AND THE HALF-BLOOD PRINCE"),
        ("J. K. R0WL1NG - HARRY P0TTER AND THE DEATHLY HALL0W5",
         "J. K. ROWLING - HARRY POTTER AND THE DEATHLY HALLOWS"),
        ("UR5ULA K. LE GU1N - A W1ZARD 0F EARTH5EA",
         "URSULA K. LE GUIN - A WIZARD OF EARTHSEA"),
        ("UR5ULA K. LE GU1N - THE T0MB5 0F ATUAN",
         "URSULA K. LE GUIN - THE TOMBS OF ATUAN"),
        ("UR5ULA K. LE GU1N - THE FARTHE5T 5H0RE",
         "URSULA K. LE GUIN - THE FARTHEST SHORE"),
        ("UR5ULA K. LE GU1N - TALE5 FR0M EARTH5EA",
         "URSULA K. LE GUIN - TALES FROM EARTHSEA"),
    ]
    for a, b in cases:
        assert correct(a) == b

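# A minimal sketch consistent with the two test functions above: they only
# ever require the OCR-confusable digits 0, 1 and 5 to be mapped to the
# letters O, I and S. This is an assumption about correct(), not the tested
# implementation itself.
def correct(text):
    return text.translate(str.maketrans("015", "OIS"))
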