def getreportdata(self, report_id, file_id, sanitize_rows=True,
                  chunk_size=2**20 * 30):
    """Get actual report data.

    Retrieve report data. You may invoke this method after the report has
    been executed.

    Args:
      report_id: Report ID
      file_id: ID of the file to be retrieved (each report might be executed
        several times, each will yield a new file ID)
      sanitize_rows: Whether to remove commas, quotes and newlines from the
        report's rows or not
      chunk_size: Download chunk size
    Returns:
      List with all the rows in the report except for the last one (which
      corresponds to totals)
    """
    dataio = self.__getreportdataraw(report_id, file_id, chunk_size)
    reader = csv.reader(dataio)
    data = list()
    for row in reader:
        if row and row[0] == 'Report Fields':
            break
    for row in reader:
        temp_row = row
        if sanitize_rows:
            temp_row = TextUtils.removecommas(temp_row)
            temp_row = TextUtils.removequotes(temp_row)
            temp_row = TextUtils.removenewlines(temp_row)
        data.append(temp_row)
    return data[0:-1]
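# Several snippets here pipe each CSV row through TextUtils.removecommas /
# removequotes / removenewlines. The real TextUtils is not shown; the helpers
# below are only a minimal sketch of what such row sanitizers could look like
# (assumed behavior: take a list of cell strings and return a new list).
def removecommas(row):
    # Strip commas so the cells can be safely re-emitted as CSV.
    return [cell.replace(',', '') for cell in row]


def removequotes(row):
    # Strip double quotes left over from quoted CSV fields.
    return [cell.replace('"', '') for cell in row]


def removenewlines(row):
    # Replace embedded newlines so every record stays on a single line.
    return [cell.replace('\n', ' ').replace('\r', ' ') for cell in row]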
def getquerydata(self, query_id, sanitize_rows=True, remove_last_row=True):
    """Get query data.

    Download query (report) actual data.

    Args:
      query_id: ID of the report to download
      sanitize_rows: Remove commas, quotes and new lines from report lines
      remove_last_row: Remove last row (typically totals will be in this row)
    Returns:
      List with lines in the report
    """
    dataio = self.__getquerydataraw(query_id)
    data = list()
    if dataio.len > 0:
        reader = csv.reader(dataio)
        for row in reader:
            if not row:
                break
            temp_row = row
            if sanitize_rows:
                temp_row = TextUtils.removecommas(temp_row)
                temp_row = TextUtils.removequotes(temp_row)
                temp_row = TextUtils.removenewlines(temp_row)
            data.append(temp_row)
    logging.debug('Report data retrieved. Number of lines: %s', len(data))
    # We remove the last row (with totals) only if there's more than one entry
    # (or totals will not be there)
    if (remove_last_row) and (len(data) > 2):
        return data[:-1]
    else:
        return data
def sdfdownloadadgroup(self, advertiser_id, sanitize_rows=True):
    """Download Ad Groups in SDF format.

    Args:
      advertiser_id: DBM advertiser ID
      sanitize_rows: Whether to remove commas, quotes and new lines from each
        row
    Returns:
      List with rows, one per Ad Group
    """
    body = {
        'fileTypes': ['AD_GROUP'],
        'filterType': 'ADVERTISER_ID',
        'filterIds': []
    }
    body['filterIds'].append(advertiser_id)
    request = self.__api.sdf().download(body=body)
    sdfdata = APIRequest(request).execute()
    data = list()
    dataio = TextUtils.toascii(sdfdata['adGroups'])
    if dataio:
        reader = csv.reader(StringIO(dataio))
        for row in reader:
            if not row:
                break
            temp_row = row
            if sanitize_rows:
                temp_row = TextUtils.removecommas(temp_row)
                temp_row = TextUtils.removequotes(temp_row)
                temp_row = TextUtils.removenewlines(temp_row)
            data.append(temp_row)
    return data
def main(argv):
    """Main function reading the task and launching the corresponding ETL job.

    Args:
      argv: array of parameters: (1) queue name, (2) task id.
    """
    # Get input arguments passed by the initial shell script.
    queue_name = str(argv[1])
    task_name = str(argv[2])
    # Initiate connectors for Google Cloud Platform (and DCM/DBM/DS as needed).
    gcp = GCPConnector(PROJECT_ID)
    # Get the first available task from the queue.
    task = gcp.gct_gettask(task_name)
    payload = task['pullMessage']['payload']
    params = json.loads(base64.urlsafe_b64decode(str(payload)))
    # Add service-specific params.
    params['schema'] = DATA_SCHEMA
    params['filename'] = TextUtils.timestamp() + '_' + str(
        params['account_id']) + '.csv'
    params['dataset'] = GBQ_DATASET
    params['table'] = GBQ_TABLE
    params['append'] = True
    # Log run info as Datastore entity.
    run_entity = gcp.gds_insert(
        kind=GDS_KIND_LOG_SERVICE,
        attributes={
            'created': TextUtils.timestamp().decode(),
            'service': params['service'].decode(),
            'status': u'RUNNING',
            'error': None,
            'bqjob': None,
            'bqstatus': None,
        })
    try:
        # Run the ETL task and update the Datastore entity status.
        job_id = service_task(gcp, params)
        run_entity['bqjob'] = job_id.decode()
        run_entity['bqstatus'] = u'RUNNING'
        run_entity['status'] = u'DONE'
    # pylint: disable=broad-except
    except Exception as e:
        run_entity['status'] = u'FAILED'
        run_entity['error'] = str(e).decode()
        logger.error(
            '[%s] - The following error occurred while executing task <%s>: <%s>',
            SERVICE_NAME, task_name, str(e))
    finally:
        gcp.gds_update(run_entity)
def get_team_name_and_score(soup, top_list_item_class_name):
    TEAM_NAME_CLASS_NAME = "cscore_team icon-font-after"
    TEAM_SPAN_CLASS_NAME = "cscore_name cscore_name--long"
    SCORE_CLASS_NAME = "cscore_score"
    team = soup.find("li", {
        "class": top_list_item_class_name
    }).find("div", TEAM_NAME_CLASS_NAME)
    team_name = team.a.find("span", {"class": TEAM_SPAN_CLASS_NAME}).text
    team_score = team.find("div", {"class": SCORE_CLASS_NAME}).text
    team_name = TextUtils.replaceQuotesInText(team_name)
    team_score = TextUtils.replaceQuotesInText(team_score)
    return team_name, team_score
def parse_table_to_2d_dict(table):
    rs_dict = {}
    row_index = 0
    is_head_two_rowspan, is_head = False, True
    for tr in table.find_all('tr'):
        col_index, cur_col_index = 0, 0
        for td in tr.find_all('td'):
            rowspan = td.get('rowspan')
            rowspan = int(rowspan) if (rowspan is not None and int(rowspan) > 1) else 1
            colspan = td.get('colspan')
            colspan = int(colspan) if (colspan is not None and int(colspan) > 1) else 1
            if is_head:
                if rowspan > 1 or colspan > 1:
                    is_head_two_rowspan = True
                is_head = False
            for r in range(rowspan):
                if (row_index + r) not in rs_dict:
                    rs_dict[row_index + r] = {}
                for c in range(colspan):
                    cur_col_index = col_index
                    while cur_col_index in rs_dict[row_index + r]:
                        cur_col_index += 1
                    rs_dict[row_index + r][cur_col_index] = TextUtils.remove_blank_chars(td.text)
                    cur_col_index += 1
            col_index = cur_col_index
        row_index += 1
    return rs_dict, is_head_two_rowspan
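# Example of the shape parse_table_to_2d_dict returns (assuming
# TextUtils.remove_blank_chars simply strips whitespace). For a table whose
# first cell spans two rows:
#   <tr><td rowspan="2">A</td><td>B</td></tr>
#   <tr><td>C</td></tr>
# the result is a row-index -> {column-index -> text} nested dict, with the
# spanning cell copied into every row it covers:
#   ({0: {0: 'A', 1: 'B'}, 1: {0: 'A', 1: 'C'}}, True)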
def parse_content(self, html_file_path):
    """
    Parse the paragraph text in the HTML.
    Returns the paragraphs in document order as a list; each paragraph is a
    list of its content lines.
    :param html_file_path:
    :return:
    """
    rs = []
    with codecs.open(html_file_path, encoding='utf-8', mode='r') as fp:
        soup = BeautifulSoup(fp.read(), "html.parser")
        paragraphs = []
        for div in soup.find_all('div'):
            div_type = div.get('type')
            if div_type is not None and div_type == 'paragraph':
                paragraphs.append(div)
        for paragraph_div in paragraphs:
            has_sub_paragraph = False
            for div in paragraph_div.find_all('div'):
                div_type = div.get('type')
                if div_type is not None and div_type == 'paragraph':
                    has_sub_paragraph = True
            if has_sub_paragraph:
                continue
            rs.append([])
            for content_div in paragraph_div.find_all('div'):
                div_type = content_div.get('type')
                if div_type is not None and div_type == 'content':
                    rs[-1].append(TextUtils.clean_text(content_div.text))
    paragraphs = []
    for content_list in rs:
        if len(content_list) > 0:
            paragraphs.append(''.join(content_list))
    return paragraphs
def parse_table_to_2d_dict(table):
    rs_dict = {}
    row_index = 0
    is_head_two_rowspan, is_head = False, True
    for tr in table.find_all('tr'):  # each tr is one row of the table
        col_index, cur_col_index = 0, 0
        for td in tr.find_all('td'):  # read the data in every cell of the row
            rowspan = td.get('rowspan')
            # number of rows the cell spans vertically, i.e. one cell's value applies to several rows
            rowspan = int(rowspan) if (rowspan is not None and int(rowspan) > 1) else 1
            colspan = td.get('colspan')
            # number of columns the cell spans horizontally
            colspan = int(colspan) if (colspan is not None and int(colspan) > 1) else 1
            if is_head:
                if rowspan > 1 or colspan > 1:
                    # what exactly do is_head_two_rowspan and is_head mean?
                    is_head_two_rowspan = True
                is_head = False
            for r in range(rowspan):
                if (row_index + r) not in rs_dict:
                    # create one dict per row to store that row's cells
                    rs_dict[row_index + r] = {}
                for c in range(colspan):
                    cur_col_index = col_index
                    while cur_col_index in rs_dict[row_index + r]:
                        cur_col_index += 1
                    # add the cell to the row's dict, keyed by column index with the cell text as value
                    rs_dict[row_index + r][cur_col_index] = TextUtils.remove_blank_chars(
                        td.text)  # this is a nested dict
                    cur_col_index += 1
            col_index = cur_col_index
        row_index += 1
    return rs_dict, is_head_two_rowspan
def normalize_num(text):
    '''
    Convert a number to a standard format (normalize subroutine).
    '''
    coeff = 1.0
    if '亿' in text:
        coeff *= 100000000
    if '万' in text:
        coeff *= 10000
    if '千' in text or '仟' in text:
        coeff *= 1000
    if '百' in text or '佰' in text:
        coeff *= 100
    if '%' in text:
        coeff *= 0.01
    try:
        number = float(TextUtils.extract_number(text))
        number_text = '%.4f' % (number * coeff)
        if '.' in number_text:
            idx = len(number_text)
            while idx > 1 and number_text[idx - 1] == '0':
                idx -= 1
            if number_text[idx - 1] == '.':
                number_text = number_text[:idx - 1]
            else:
                number_text = number_text[:idx]
        return number_text
    except:
        return text
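# Worked examples for normalize_num above (assuming TextUtils.extract_number
# returns the leading numeric part of the string, e.g. '1.5' from '1.5万'):
#   normalize_num('1.5万')  -> '15000'      (1.5 * 10000, trailing zeros stripped)
#   normalize_num('3.00%')  -> '0.03'       (3.00 * 0.01)
#   normalize_num('2亿')    -> '200000000'
# If anything in the conversion fails, the original text is returned unchanged.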
def normalize_num(self, text):
    coeff = 1.0
    if '亿' in text:
        coeff *= 100000000
    if '万' in text:
        coeff *= 10000
    if '千' in text or '仟' in text:
        coeff *= 1000
    if '百' in text or '佰' in text:
        coeff *= 100
    if '%' in text:
        coeff *= 0.01
    try:
        number = float(TextUtils.extract_number(text))
        number_text = '%.4f' % (number * coeff)
        if number_text.endswith('.0'):
            return number_text[:-2]
        elif number_text.endswith('.00'):
            return number_text[:-3]
        elif number_text.endswith('.000'):
            return number_text[:-4]
        elif number_text.endswith('.0000'):
            return number_text[:-5]
        else:
            if '.' in number_text:
                idx = len(number_text)
                while idx > 1 and number_text[idx - 1] == '0':
                    idx -= 1
                number_text = number_text[:idx]
            return number_text
    except:
        return text
def main():
    try:
        gcp = GCPConnector(PROJECT_ID)
        # This is a basic input configuration object - you might want to use a
        # different approach (e.g. input fields in a Spreadsheet) to allow a
        # more flexible configuration.
        config_data = [['account1', 'Account Number 1'],
                       ['account2', 'Account Number 2']]
        for row in config_data:
            # Add params to be passed via task payload
            task_params = dict()
            task_params['service'] = SERVICE_NAME  # Mandatory field
            task_params['run_script'] = GCE_RUN_SCRIPT  # Mandatory field
            task_params['bucket'] = GCS_BUCKET
            task_params['dataset'] = GBQ_DATASET
            # And add service-specific params
            task_params['account_id'] = row[0]
            task_params['label'] = row[1]
            task_params['schema'] = DATA_SCHEMA
            task_params['filename'] = TextUtils.timestamp() + '_' + str(
                task_params['account_id']) + '.csv'
            task_params['table'] = GBQ_TABLE
            task_params['append'] = True
            service_template_run.service_task(gcp, task_params)
    # pylint: disable=broad-except
    except Exception as e:
        print e.message
def getreportdata(self, report_id, sanitize_rows=True):
    """Get report data.

    Download report data, once executed. This method blocks until the report
    has finished executing.

    Args:
      report_id: ID of the report to be downloaded
      sanitize_rows: Whether to remove commas and quotes from the rows in the
        report or not
    Returns:
      List with all the rows in the report
    """
    dataio = self.__getreportdataraw(report_id)
    reader = csv.reader(dataio)
    data = list()
    for row in reader:
        temp_row = row
        if sanitize_rows:
            temp_row = TextUtils.removecommas(temp_row)
            temp_row = TextUtils.removequotes(temp_row)
        data.append(temp_row)
    return data
def count(self, target):
    """Target text count in document"""
    if not target:
        return 0
    index = self._char_pos_map[target[0]]
    if len(index) == 0:
        return 0
    if len(target) == 1:
        return len(index)
    count = 0
    for pos in index:
        if TextUtils.match(self._document, pos, target):
            count += 1
    return count
def find(self, target):
    """All target text matching start index in document

    Yields:
        index_list
    """
    if not target:
        yield 0
        return
    # All indices in the document where the first character of the target appears
    index = self._char_pos_map[target[0]]
    if len(index) == 0:
        yield 0
        return
    if len(target) == 1:
        for pos in index:
            yield pos
        return
    for pos in index:
        if TextUtils.match(self._document, pos, target):
            yield pos
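# The count()/find() methods above rely on an index that maps each character to
# the positions where it occurs in self._document. A minimal sketch of how such
# an index could be built (an assumption, not the original class):
from collections import defaultdict


def build_char_pos_map(document):
    # Map every character to the sorted list of offsets where it appears.
    char_pos_map = defaultdict(list)
    for pos, ch in enumerate(document):
        char_pos_map[ch].append(pos)
    return char_pos_map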
def main(): """ Start the Slack Client """ os.system("clear; figlet 'Slack Gitsin' | lolcat") history = FileHistory(os.path.expanduser("~/.slackHistory")) while True: text = prompt("slack> ", history=history, auto_suggest=AutoSuggestFromHistory(), on_abort=AbortAction.RETRY, style=DocumentStyle, completer=Completer(fuzzy_match=False, text_utils=TextUtils()), complete_while_typing=Always(), get_bottom_toolbar_tokens=get_bottom_toolbar_tokens, key_bindings_registry=manager.registry, accept_action=AcceptAction.RETURN_DOCUMENT ) slack = Slack(text) slack.run_command()
def main():
    try:
        gcp = GCPConnector(PROJECT_ID)
        dcm = DCMConnector(credential_file=CREDENTIAL_FILE,
                           user_email=None,
                           profile_id=DCM_PROFILE_ID,
                           api_version=DCM_API_VER)
        # In this example, we're mocking the config parameters (DCM partner and
        # advertiser IDs respectively):
        config_data = [['1234', '1111111'], ['5678', '2222222']]
        for row in config_data:
            # Add params to be passed via task payload
            task_params = dict()
            task_params['service'] = SERVICE_NAME  # Mandatory field
            task_params['run_script'] = GCE_RUN_SCRIPT  # Mandatory field
            task_params['account_id'] = row[0]
            task_params['advertiser_id'] = row[1]
            task_params['bucket'] = GCS_BUCKET
            task_params['dataset'] = GBQ_DATASET
            # And add service-specific params
            task_params['report_template'] = DCM_REPORT_TEMPLATE
            task_params['report_name'] = DCM_REPORT_NAME
            task_params['date_range'] = DCM_REPORT_DATE_RANGE
            task_params['schema'] = DATA_SCHEMA_STANDARD
            task_params['filename'] = TextUtils.timestamp() + '_' + str(
                task_params['account_id']) + '.csv'
            task_params['table'] = GBQ_TABLE
            task_params['append'] = True
            service_example_run.service_task(dcm, gcp, task_params)
    # pylint: disable=broad-except
    except Exception as e:
        print e.message
def parse_content(self, html_file_path):
    """
    Parse the paragraph text in the HTML.
    Returns the paragraphs in document order as a list; each paragraph is a
    list of its content lines.
    :param html_file_path:
    :return:
    """
    rs = []
    with codecs.open(html_file_path, encoding='utf-8', mode='r') as fp:
        soup = BeautifulSoup(fp.read(), "html.parser")
        paragraphs = []
        for div in soup.find_all('div'):
            div_type = div.get('type')
            # collect the divs whose type == 'paragraph'
            if div_type is not None and div_type == 'paragraph':
                paragraphs.append(div)
        for paragraph_div in paragraphs:
            has_sub_paragraph = False
            # check whether this paragraph contains sub-paragraphs
            for div in paragraph_div.find_all('div'):
                div_type = div.get('type')
                if div_type is not None and div_type == 'paragraph':
                    has_sub_paragraph = True
            if has_sub_paragraph:
                # if there are sub-paragraphs, skip this one; they will be visited later in the loop
                continue
            rs.append([])  # each paragraph's content lines are stored in a sub-list of rs
            # append the paragraph's content divs to the list
            for content_div in paragraph_div.find_all('div'):
                div_type = content_div.get('type')
                if div_type is not None and div_type == 'content':
                    rs[-1].append(TextUtils.clean_text(content_div.text))
    paragraphs = []
    for content_list in rs:
        if len(content_list) > 0:
            # join each content_list into a single string
            paragraphs.append(''.join(content_list))
    return paragraphs
def get_character_aspects(self, char=None):
    char = char if char else self
    available = self.get_invokable_objects(char)
    aspects_strings = []
    for obj in available:
        if obj['char'].high_concept:
            name = TextUtils.clean(obj['char'].name)
            high_concept = TextUtils.clean(obj['char'].high_concept)
            aspects_strings.append(
                f'***{high_concept}*** (High Concept of _\'{name}\'_)')
        if obj['char'].trouble:
            name = TextUtils.clean(obj['char'].name)
            trouble = TextUtils.clean(obj['char'].trouble)
            aspects_strings.append(
                f'***{trouble}*** (Trouble of _\'{name}\'_)')
        if obj.category in ['Aspect', 'Stunt']:
            name = TextUtils.clean(obj['char'].name)
            category = TextUtils.clean(obj['char'].category) + (
                ' _(Boost)_ ' if self.is_boost else '')
            parent = TextUtils.clean(obj['parent'].name)
            aspects_strings.append(
                f'***{name}*** ({category} of _\'{parent}\'_)')
    return aspects_strings
def task_manager():
    """Task manager function.

    Looks for tasks in the DNA queues and launches CE instances accordingly.

    Returns:
      Standard 'OK' string to confirm completed execution.
    """
    gcp = GCPConnector(PROJECT_ID)
    for level in ['l0', 'l1', 'l2', 'l3']:
        zone = GCE_MACHINE_MAP[level]['zone']
        queue = GCE_MACHINE_MAP[level]['queue']
        quota = GCE_MACHINE_MAP[level]['quota']
        vm_type = GCE_MACHINE_MAP[level]['type']
        # Retrieve the list of existing instances if any
        num_running_ce = 0
        ce_list = gcp.gce_listinstances(zone)
        if 'items' in ce_list:
            for item in ce_list['items']:
                starting_idx = item['machineType'].find('machineTypes/')
                mtype = item['machineType'][starting_idx + 13:]
                if mtype == vm_type:
                    num_running_ce += 1
        # Check how many tasks are in the queue and calculate the number of CE
        # machines to be created
        task_list = gcp.gct_listtasks(queue)
        logging.debug('%s elements in queue [%s]', len(task_list), queue)
        if 'tasks' in task_list:
            num_tasks = len(task_list['tasks'])
            pool_size = quota - num_running_ce
            pool_size = min(pool_size, num_tasks)
        else:
            logging.debug('No \'tasks\' in Cloud Task queue')
            # No tasks in the queue
            pool_size = 0
        logging.debug('Level: [%s]. Pool size: %s', level, pool_size)
        # Create a pool of CE instances if pool_size > 0
        if pool_size > 0:
            for i in range(pool_size):
                machine_id = '%s-%s' % (level, str(i))
                instance_name = 'dna-machine-%s-%s' % (machine_id,
                                                       str(uuid.uuid1().hex))
                logging.debug(
                    'Configuring new machine. ID: [%s]. Instance name: [%s]',
                    machine_id, instance_name)
                # Insert a new Datastore entry for each CE instance
                ce_entity = gcp.gds_insert(
                    kind=GDS_KIND_CE_INSTANCE,
                    attributes={
                        'name': instance_name,
                        'zone': zone,
                        'created': TextUtils.timestamp(),
                        't0': time.time(),
                        'status': None
                    })
                # Get the basic configuration as defined in the GCPConnector class
                ce_config = gcp.gce_configinstance(
                    name=instance_name,
                    zone=zone,
                    machine_type=vm_type,
                    service_account=GCE_SERVICE_ACCOUNT,
                    scopes=GCE_SCOPES)
                # Add some metadata
                ce_config = gcp.gce_addmetadata(
                    config=ce_config,
                    key='startup-script-url',
                    value=GCE_STARTUP_SCRIPT)
                ce_config = gcp.gce_addmetadata(
                    config=ce_config,
                    key='shutdown-script-url',
                    value=GCE_SHUTDOWN_SCRIPT)
                ce_config = gcp.gce_addmetadata(
                    config=ce_config, key='machine-id', value=machine_id)
                ce_config = gcp.gce_addmetadata(
                    config=ce_config,
                    key='ce-entity-id',
                    value=str(ce_entity.key.id))
                ce_config = gcp.gce_addmetadata(
                    config=ce_config,
                    key='project-root',
                    value=GCS_PROJECT_ROOT)
                ce_config = gcp.gce_addmetadata(
                    config=ce_config, key='level', value=level)
                # Create the instance
                gcp.gce_createinstance(ce_config)
                logging.debug('Instance created. Name: [%s]', instance_name)
                # Update the status of the corresponding Datastore entity
                ce_entity['status'] = DNA_STATUS_CREATED
                gcp.gds_update(ce_entity)
                logging.debug('CE DataStore entity updated')
    return 'OK'
def getLongFromText(text):
    return TextUtils.remove_comma_in_number(text)
from selenium import webdriver
import mysql.connector
import urlparse  # needed for urlparse.urlparse / parse_qs below (Python 2)
from utils import TextUtils


def avatar_url_from_vid(vid):
    return "https://i.ytimg.com/vi/%s/default_live.jpg" % vid


def cover_img_url_from_vid(vid):
    return "https://i.ytimg.com/vi/%s/maxresdefault.jpg" % vid


good_video = [
    lambda v: not TextUtils.is_blank(v['title']),
    lambda v: TextUtils.is_plain_text(v['title']),
]


def parse_video_web_url(u):
    up = urlparse.urlparse(u)
    filtered_fields = ('index', 'list')
    new_query = {}
    for k, v in urlparse.parse_qs(up.query, keep_blank_values=True,
                                  strict_parsing=True).items():
        if k not in filtered_fields:
            new_query[k] = v[0] if isinstance(v, list) else v
    return urlparse.urlunparse((up.scheme, up.netloc, up.path, up.params,
def tokenize(self, s: str) -> Sequence:
    s = TextUtils.normalize_newlines(s)
    return seq(re.split(r"\n{2,}", s))
def tokenize(self, s: str) -> Sequence:
    s = TextUtils.normalize_newlines(s)
    return seq(re.split(r"\n", s)).filter(TextUtils.has_content)
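# Illustration of the two tokenize() variants above, without the seq() wrapper
# (sample text is made up): the first splits into paragraphs on blank lines,
# the second into individual lines, then filters out empty ones.
import re

sample = "first line\nsecond line\n\nnext paragraph"
re.split(r"\n{2,}", sample)  # ['first line\nsecond line', 'next paragraph']
re.split(r"\n", sample)      # ['first line', 'second line', '', 'next paragraph']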
def get_commentary(soup):
    global last_comment
    global has_updates
    root_div_tags_children = soup.find(
        "article", {"class": "sub-module match-commentary cricket"})
    if (root_div_tags_children is None):
        root_div_tags_children = soup.find(
            "article",
            {"class": "sub-module match-commentary cricket add-padding"})
    if (root_div_tags_children is None):
        LOGGER.error_with_time("Couldn't find article class. Aborting.")
        exit(1)
    root_div_tags_children = root_div_tags_children.find(
        "div", {"class": "content"})
    if (root_div_tags_children is None):
        LOGGER.error_with_time(
            "Couldn't find div for root_div_tags_children. Aborting.")
        exit(1)
    commentary = []
    for commentary_item in root_div_tags_children:
        over = commentary_item.find("div", {"class": "time-stamp"})
        description = commentary_item.find("div", {"class": "description"})
        if (over is None):
            over = ""
        else:
            over = TextUtils.replaceQuotesInText(over.text)
        if (description is None or properties.IS_TEST_MODE):
            description = ""
        else:
            description = TextUtils.replaceQuotesInText(description.text)
        comment = Comment(over, description)
        paragraphs = commentary_item.findAll("p", {"class": "comment"})
        if (paragraphs is None):
            paragraphs = []
        if not properties.IS_TEST_MODE:
            for p in paragraphs:
                p = TextUtils.replaceQuotesInText(p.text)
                comment.add_paragraph(p)
        if (len(over) != 0 or len(description) != 0
                or len(comment.paragraphs) != 0):
            commentary.append(comment)
    commentary.reverse()
    ind = -1
    for i in range(len(commentary)):
        if (commentary[i].over == last_comment.over):
            ind = i
            break
    if (ind >= 0 and
            (commentary[ind].description != last_comment.description or
             commentary[ind].paragraphs != last_comment.paragraphs)):
        ind -= 1
    commentary = commentary[(ind + 1):len(commentary)]
    if (len(commentary) > 0):
        last_comment = commentary[-1]
    else:
        has_updates = False
    return commentary
import pandas as pd
import tensorflow as tf
from nn.similarNN import Model
import _pickle as cPickle
from utils import TextUtils

save_model_class = './saved/24072018/model.pkl'
save_model_deep = './saved/24072018/model.ckpt'
data_file_path = './data/cikm_test_a_20180516.txt'
data_file_headers = ['spa_sent_1', 'spa_sent_2']

if __name__ == '__main__':
    text_util = TextUtils()
    text_util.pad_id = 1
    text_util.unk_id = 0
    """
    Restore model
    """
    model = cPickle.load(open(save_model_class, 'rb'))
    model.build(build_session=True, init_word_embedding=None)
    model.restore(save_model_deep)
    """
    Load data
    """
    data_df = pd.read_csv(data_file_path, sep='\t', header=None,
                          names=data_file_headers)
    """
def main(argv):
    """Main function reading the task and launching the corresponding ETL job.

    Args:
      argv: array of parameters: (1) queue name, (2) task id.
    """
    # Get input arguments passed by the service-example-run.sh script
    queue_name = str(argv[1])
    task_name = str(argv[2])
    logger.info('Starting service-example processing task. Queue name: [%s]. '
                'Task name: [%s]', queue_name, task_name)
    # Initiate connectors for Google Cloud Platform and DCM.
    gcp = GCPConnector(PROJECT_ID)
    dcm = DCMConnector(
        credential_file=CREDENTIAL_FILE,
        user_email=None,
        profile_id=DCM_PROFILE_ID,
        api_version=DCM_API_VER)
    # Get the first available task from the queue.
    task = gcp.gct_gettask(task_name)
    payload = task['pullMessage']['payload']
    params = json.loads(base64.urlsafe_b64decode(str(payload)))
    # Add service-specific params.
    params['report_template'] = DCM_REPORT_TEMPLATE
    params['report_name'] = DCM_REPORT_NAME
    params['date_range'] = DCM_REPORT_DATE_RANGE
    params['schema'] = DATA_SCHEMA_STANDARD
    params['filename'] = TextUtils.timestamp() + '_' + str(
        params['account_id']) + '.csv'
    params['table'] = GBQ_TABLE
    params['append'] = False
    # Log run info as Datastore entity.
    run_entity = gcp.gds_insert(
        kind=GDS_KIND_LOG_SERVICE,
        attributes={
            'created': TextUtils.timestamp().decode(),
            'service': params['service'].decode(),
            'status': u'RUNNING',
            'error': None,
            'bqjob': None,
            'bqstatus': None,
        })
    try:
        # Run the ETL task with the given params and update the Datastore entity.
        job_id = service_task(dcm, gcp, params)
        run_entity['bqjob'] = job_id.decode()
        run_entity['bqstatus'] = u'RUNNING'
        run_entity['status'] = u'DONE'
    # pylint: disable=broad-except
    except Exception as e:
        run_entity['status'] = u'FAILED'
        run_entity['error'] = str(e).decode()
        logger.error(
            '[%s] - The following error occurred while executing task <%s>: <%s>',
            SERVICE_NAME, task_name, str(e))
    finally:
        gcp.gds_update(run_entity)
def get_long_from_text(text):
    return TextUtils.remove_comma_in_number(text)
def make_fold(train_df, test_df, save_model_class, save_model_deep):
    text_util = TextUtils()
    # preprocessing and tokenization
    train_spa_sent_1_df = train_df['spa_sent_1'].tolist()
    train_spa_sent_2_df = train_df['spa_sent_2'].tolist()
    test_spa_sent_1_df = test_df['spa_sent_1'].tolist()
    test_spa_sent_2_df = test_df['spa_sent_2'].tolist()
    train_spa_tokens_1 = text_util.tokenize(sentences=train_spa_sent_1_df,
                                            language=text_util.spanish)
    train_spa_tokens_2 = text_util.tokenize(sentences=train_spa_sent_2_df,
                                            language=text_util.spanish)
    test_spa_tokens_1 = text_util.tokenize(sentences=test_spa_sent_1_df,
                                           language=text_util.spanish)
    test_spa_tokens_2 = text_util.tokenize(sentences=test_spa_sent_2_df,
                                           language=text_util.spanish)
    # build the vocabulary (using only the training dataset)
    train_spa_tokens = train_spa_tokens_1 + train_spa_tokens_2
    train_label_df = train_df['label'].tolist()
    (spa_id2word, spa_word2id), spa_E_by_id = text_util.create_word_vocab(
        lst_tokens=train_spa_tokens,
        word_dim=300,
        fasttext_path='./data/new/pretrained/mine.wiki.es.vec')
    (id2label, label2id) = text_util.create_label_vocab(labels=train_label_df)
    # build the dataset (i.e. convert tokens and labels to their corresponding ids)
    train_dataset = text_util.create_dataset(lst_tokens_1=train_spa_tokens_1,
                                             lst_tokens_2=train_spa_tokens_2,
                                             labels=train_label_df,
                                             label2id=label2id,
                                             word2id_1=spa_word2id,
                                             word2id_2=spa_word2id)
    test_dataset = text_util.create_dataset(lst_tokens_1=test_spa_tokens_1,
                                            lst_tokens_2=test_spa_tokens_2,
                                            labels=test_df['label'].tolist(),
                                            label2id=label2id,
                                            word2id_1=spa_word2id,
                                            word2id_2=spa_word2id)
    # create batches
    train_batches = text_util.create_batch(dataset=train_dataset,
                                           batch_size=batch_size)
    test_batches = text_util.create_batch(dataset=test_dataset,
                                          batch_size=batch_size)
    # training
    train_score = train(train_batchs=train_batches,
                        test_batchs=test_batches,
                        n_epoch=n_epoch,
                        init_lr=init_lr,
                        init_keep_prob=init_keep_prob,
                        init_word_emb=spa_E_by_id,
                        text_util=text_util,
                        save_model_class=save_model_class,
                        save_model_deep=save_model_deep,
                        word2id=spa_word2id,
                        label2id=label2id)
    return train_score