def __math_garden_area(plot_surface, build_surface):
    # If plot_surface and build_surface are not None:
    if plot_surface and build_surface:
        # Convert string to int
        plot_surface = Cleaner.string_to_int(plot_surface.strip())
        build_surface = Cleaner.string_to_int(build_surface.strip())
        # If both conversions succeeded:
        if plot_surface and build_surface:
            print(plot_surface - build_surface)
            return plot_surface - build_surface
    return None

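# A minimal standalone sketch (an assumption, not taken from the source) of the
# same garden-area computation, with a stand-in _string_to_int helper in place
# of Cleaner.string_to_int, which is assumed to pull an integer out of a
# surface string such as "350 m2":
import re

def _string_to_int(value):
    # Grab the first run of digits; None when the string holds no number.
    match = re.search(r"\d+", value)
    return int(match.group()) if match else None

def garden_area(plot_surface, build_surface):
    if plot_surface and build_surface:
        plot = _string_to_int(plot_surface.strip())
        build = _string_to_int(build_surface.strip())
        if plot and build:
            return plot - build
    return None

# garden_area("350 m2", "120 m2")  -> 230
# garden_area(None, "120 m2")      -> None
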
def compile(self):
    self.token = Cleaner(self.questn).make_final_string()
    self.location = Geocoder(self.token).get_location()
    if self.location[0] == "OK":
        try:
            self.wikiresult = WikiSearcher(self.location).geolookup()
            self.wikiurl = WikiSearcher(self.location).get_url()
            self.finalData = {
                "status": self.location[0],
                "lat": self.location[1][0],
                "long": self.location[1][1],
                "wikiresult": self.wikiresult,
                "wikiurl": self.wikiurl,
                "granpyMessage": random.choice(ok_res),
            }
        except IndexError:
            self.finalData = {
                "status": "NOK",
                "warningMessage": random.choice(no_res),
            }
    else:
        self.finalData = {
            "status": self.location[0],
            "warningMessage": random.choice(nok_res),
        }
    return self.finalData

def search_col_xs_7(title, room_type):
    """
    Search for a given room in a 'col-xs-7' div under 'More Info'.
    Add its number to 'rooms_number'.
    """
    nonlocal rooms_number
    result = self.__get_text(
        self.__scrap_field_value('div', 'col-xs-7 info-name', title))
    if result:
        rooms_number += Cleaner.string_to_int(result, 1)
        has_found[room_type] = True

def clean(self, tfile):
    """
    Clean data.

    Parameters
    ----------
    tfile : string
        The path of the data needing to be processed.

    Returns
    -------
    new_data : pd.DataFrame
    """
    # Load data
    data = pd.read_csv(
        tfile,
        sep='\t',
        header=None,
        names=['polarity', 'aspect', 'target', 'startend', 'message'])
    # Clean the data
    cleaner = Cleaner()
    new_data = cleaner.remove_punctuation_dataframe(data)
    new_data = cleaner.remove_digits_dataframe(new_data)
    new_data = cleaner.lemmatization_dataframe(new_data)
    new_data = cleaner.lower_case(new_data)
    return new_data

def start(self):
    # Grab the urls to scrap
    manager = Manager()
    manager.grabber(10)
    # Print starting message
    total_urls_number = len(manager.urls)
    print(f"[+] Scrapping phase started: 0/{total_urls_number}.")
    scrapped_urls = 0
    # Group the urls 10 by 10
    grouped_total_urls = self.grouper(manager.urls, self.url_pool_size)
    with ThreadPoolExecutor(max_workers=self.scrapper_workers) as executor:
        futures = [
            executor.submit(Manager.scrapper, urls)
            for urls in grouped_total_urls
        ]
        for entry in concurrent.futures.as_completed(futures):
            # Increment scrapped_urls and print a status message
            scrapped_urls += self.url_pool_size
            print(f"[i] Urls scrapped: {scrapped_urls}/{total_urls_number}.")
            Cleaner(entry.result()).clean()
    print(
        f"[i] Urls scrapped: {total_urls_number}/{total_urls_number} - Complete !"
    )
    # Merge the pickles
    Merger().merge()

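# A hedged sketch (assumption, not taken from the source) of the grouper
# helper called above: the classic itertools recipe that yields fixed-size
# batches of an iterable, here the url list in chunks of url_pool_size.
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # grouper('ABCDEFG', 3) -> ('A','B','C') ('D','E','F') ('G', None, None)
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
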
def split():
    """
    This endpoint expects:
    1 - a file with raw OCR text
    2 - an email address for the patient/submitter of pathology report
    :return:
    """
    PRODUCTION_ENDPOINT, TEST_ENDPOINT = get_endpoints()
    content = request.get_json()
    text, email = content['text'], content['email'].strip()
    # Remove PHI
    de_identified_text = filter_task(text, scispacy)
    # Clean text
    cleaner = Cleaner(de_identified_text)
    cleaned_text = cleaner.text
    # Finally preprocess
    preprocessor = SpacyPreProcessor(scispacy)
    text, tokens_list = preprocessor.preprocess_sentences(cleaned_text)
    m = {'text': text, 'tokens': tokens_list, 'email': email}
    response = requests.post(url=TEST_ENDPOINT, json=m)
    return response.json()

    #mongo_cols = {'acct_type','user_type','email_domain','venue_state','venue_name'}
    client = MongoClient()
    db = client[client_name]
    tab = db[tab_name]
    cursor = tab.find(None)  #mongo_cols)
    df = pd.DataFrame(list(cursor))
    return df


if __name__ == '__main__':
    # read data
    dataframe = get_data()
    #print(dataframe)

    # clean data
    y = dataframe['acct_type'].str.contains('fraud').astype(int)
    X_train, X_test, y_train, y_test = \
        train_test_split(dataframe, y, random_state=142)
    print('cleaning....')
    clean = Cleaner()
    clean.fit(X_train)
    X_train = clean.transform(X_train)
    X_test = clean.transform(X_test)

    print('Fitting....')
    # fit model
    gb = GBModel()
    gb.fit(X_train, y_train)
    print('score: {}'.format(gb.score(X_test, y_test)))

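# A minimal sketch (assumption, not the project's actual Cleaner) of the
# scikit-learn-style fit/transform interface the script above relies on:
# remember the usable columns at fit time and apply the same selection to
# both train and test frames. The real cleaning logic is not shown in the
# source, so the column handling here is purely illustrative.
import pandas as pd

class CleanerSketch:
    def fit(self, X):
        # Record the numeric columns seen during training.
        self.columns_ = list(X.select_dtypes('number').columns)
        return self

    def transform(self, X):
        # Align any frame to the training columns, filling gaps with 0.
        return X.reindex(columns=self.columns_, fill_value=0)
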
def test_get_question():
    sut = Cleaner("Granpy quelle est l'adresse de la Poste?")
    assert sut.question == "Granpy quelle est l'adresse de la Poste?"


"""
Turn question into all lowercase with lowercase method
"""

def test_final_string():
    sut = Cleaner("Granpy quelle est l'adresse de la Tour Eiffel?")
    assert sut.make_final_string() == "tour+eiffel"

def test_concatenate():
    sut = Cleaner("Granpy quelle est l'adresse de la Tour Eiffel?")
    assert sut.make_final_string() == "tour+eiffel"


"""
Returns the final string
"""

def test_token():
    sut = Cleaner("Granpy quelle est l'adresse de la Poste?")
    assert sut.make_final_string() == "poste"


"""
Takes words from wordlist and adds a + in between
"""

def test_turn_string_to_wordlist():
    sut = Cleaner("Granpy quelle est l'adresse de la Tour Eiffel?")
    assert sut.make_final_string() == "tour+eiffel"


"""
Clean wordlist from unwanted common words with stopword method
"""

def test_turn_lowercase():
    sut = Cleaner("Granpy quelle est l'adresse de la POSTE?")
    assert sut.make_final_string() == "poste"


"""
Turn question (string) into a word list with wordlist method
"""

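# A minimal sketch (assumption, not the project's actual Cleaner) of the
# behaviour the tests above expect: keep the raw question, lowercase it,
# split it into a word list, drop common French stop words plus the bot's
# name and the word "adresse", and join what remains with '+'. The stop-word
# set below is illustrative only.
import re

_STOPWORDS = {"granpy", "quelle", "est", "l", "adresse", "de", "la", "le", "les", "du"}

class CleanerSketch:
    def __init__(self, question):
        self.question = question

    def make_final_string(self):
        words = re.findall(r"[a-zà-ÿ]+", self.question.lower())
        return "+".join(w for w in words if w not in _STOPWORDS)

# CleanerSketch("Granpy quelle est l'adresse de la Tour Eiffel?").make_final_string()
# -> "tour+eiffel"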