Example #1
    def getreportdata(self,
                      report_id,
                      file_id,
                      sanitize_rows=True,
                      chunk_size=2**20 * 30):
        """Get actual report data.

    Retrieve the report data. You may invoke this method after the
    report has been executed.

    Args:
      report_id: Report ID
      file_id: ID of the file to be retrieved (each report might be executed
        several times, each will yield a new file ID)
      sanitize_rows: Whether to remove commas, quotes and newlines from the
        report's rows or not
      chunk_size: Download chunk size
    Returns:
      List with all the rows in the report except for the last one (which
      corresponds to totals)
    """
        dataio = self.__getreportdataraw(report_id, file_id, chunk_size)
        reader = csv.reader(dataio)
        data = list()
        for row in reader:
            if row and row[0] == 'Report Fields':
                break
        for row in reader:
            temp_row = row
            if sanitize_rows:
                temp_row = TextUtils.removecommas(temp_row)
                temp_row = TextUtils.removequotes(temp_row)
                temp_row = TextUtils.removenewlines(temp_row)
            data.append(temp_row)
        return data[0:-1]
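A minimal usage sketch for getreportdata above; the connector instance and the report/file IDs are illustrative placeholders, not values taken from the snippet:

# conn is assumed to be an instance of the class that defines getreportdata;
# the report must already have been executed so that file_id is available.
rows = conn.getreportdata(report_id=12345678, file_id=87654321)
header, body = rows[0], rows[1:]  # first row after 'Report Fields' is the header
print('Downloaded %d data rows' % len(body))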
Example #2
  def getquerydata(self, query_id, sanitize_rows=True, remove_last_row=True):
    """Get query data.

    Download the actual query (report) data.

    Args:
      query_id: ID of the report to download
      sanitize_rows: Remove commas, quotes and new lines from report lines
      remove_last_row: Remove last row (typically totals will be in this row)
    Returns:
      List with lines in the report
    """
    dataio = self.__getquerydataraw(query_id)
    data = list()
    if dataio.len > 0:
      reader = csv.reader(dataio)
      for row in reader:
        if not row:
          break
        temp_row = row
        if sanitize_rows:
          temp_row = TextUtils.removecommas(temp_row)
          temp_row = TextUtils.removequotes(temp_row)
          temp_row = TextUtils.removenewlines(temp_row)
        data.append(temp_row)
    logging.debug('Report data retrieved. Number of lines: %s', len(data))
    # Remove the last row (with totals) only if there is more than one data row
    # (otherwise there is no totals row to remove)
    if (remove_last_row) and (len(data) > 2):
      return data[:-1]
    else:
      return data
Example #3
  def sdfdownloadadgroup(self, advertiser_id, sanitize_rows=True):
    """Download Ad Groups in SDF format.

    Args:
      advertiser_id: DBM advertiser ID
      sanitize_rows: Whether to remove commas, quotes and new lines from each
        row
    Returns:
      List with rows, one per Ad Group
    """
    body = {
        'fileTypes': ['AD_GROUP'],
        'filterType': 'ADVERTISER_ID',
        'filterIds': []
    }
    body['filterIds'].append(advertiser_id)
    request = self.__api.sdf().download(body=body)
    sdfdata = APIRequest(request).execute()
    data = list()
    dataio = TextUtils.toascii(sdfdata['adGroups'])
    if dataio:
      reader = csv.reader(StringIO(dataio))

      for row in reader:
        if not row:
          break
        temp_row = row
        if sanitize_rows:
          temp_row = TextUtils.removecommas(temp_row)
          temp_row = TextUtils.removequotes(temp_row)
          temp_row = TextUtils.removenewlines(temp_row)
        data.append(temp_row)
    return data
Example #4
def main(argv):
    """Main function reading the task and launching the corresponding ETL job.

  Args:
    argv: array of parameters: (1) queue name, (2) task id.
  """

    # Get input arguments passed by the initial shell script.
    queue_name = str(argv[1])
    task_name = str(argv[2])

    # Initiate connectors for Google Cloud Platform (and DCM/DBM/DS as needed).
    gcp = GCPConnector(PROJECT_ID)

    # Get the first available task from the queue.
    task = gcp.gct_gettask(task_name)
    payload = task['pullMessage']['payload']
    params = json.loads(base64.urlsafe_b64decode(str(payload)))

    # Add service-specific params.
    params['schema'] = DATA_SCHEMA
    params['filename'] = TextUtils.timestamp() + '_' + str(
        params['account_id']) + '.csv'
    params['dataset'] = GBQ_DATASET
    params['table'] = GBQ_TABLE
    params['append'] = True

    # Log run info as Datastore entity.
    run_entity = gcp.gds_insert(kind=GDS_KIND_LOG_SERVICE,
                                attributes={
                                    'created': TextUtils.timestamp().decode(),
                                    'service': params['service'].decode(),
                                    'status': u'RUNNING',
                                    'error': None,
                                    'bqjob': None,
                                    'bqstatus': None,
                                })
    try:
        # Run the ETL task and update the Datastore entity status.
        job_id = service_task(gcp, params)
        run_entity['bqjob'] = job_id.decode()
        run_entity['bqstatus'] = u'RUNNING'
        run_entity['status'] = u'DONE'

    # pylint: disable=broad-except
    except Exception as e:
        run_entity['status'] = u'FAILED'
        run_entity['error'] = str(e).decode()
        logger.error(
            '[%s] - The following error occurred while executing task <%s> : <%s>',
            SERVICE_NAME, task_name, str(e))
    finally:
        gcp.gds_update(run_entity)
Example #5
def get_team_name_and_score(soup, top_list_item_class_name):
    TEAM_NAME_CLASS_NAME = "cscore_team icon-font-after"
    TEAM_SPAN_CLASS_NAME = "cscore_name cscore_name--long"
    SCORE_CLASS_NAME = "cscore_score"

    team = soup.find("li", {
        "class": top_list_item_class_name
    }).find("div", TEAM_NAME_CLASS_NAME)
    team_name = team.a.find("span", {"class": TEAM_SPAN_CLASS_NAME}).text
    team_score = team.find("div", {"class": SCORE_CLASS_NAME}).text
    team_name = TextUtils.replaceQuotesInText(team_name)
    team_score = TextUtils.replaceQuotesInText(team_score)

    return team_name, team_score
Example #6
 def parse_table_to_2d_dict(table):
     rs_dict = {}
     row_index = 0
     is_head_two_rowspan, is_head = False, True
     for tr in table.find_all('tr'):
         col_index, cur_col_index = 0, 0
         for td in tr.find_all('td'):
             rowspan = td.get('rowspan')
             rowspan = int(rowspan) if (rowspan is not None
                                        and int(rowspan) > 1) else 1
             colspan = td.get('colspan')
             colspan = int(colspan) if (colspan is not None
                                        and int(colspan) > 1) else 1
             if is_head:
                 if rowspan > 1 or colspan > 1:
                     is_head_two_rowspan = True
                 is_head = False
             for r in range(rowspan):
                 if (row_index + r) not in rs_dict:
                     rs_dict[row_index + r] = {}
                 for c in range(colspan):
                     cur_col_index = col_index
                     while cur_col_index in rs_dict[row_index + r]:
                         cur_col_index += 1
                     rs_dict[
                         row_index +
                         r][cur_col_index] = TextUtils.remove_blank_chars(
                             td.text)
                     cur_col_index += 1
             col_index = cur_col_index
         row_index += 1
     return rs_dict, is_head_two_rowspan
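To illustrate the return value: for a small table whose first header cell spans two rows, and assuming TextUtils.remove_blank_chars only strips whitespace, the nested dict repeats the spanning cell in every row it covers:

# <tr><td rowspan="2">Name</td><td>2019</td><td>2020</td></tr>
# <tr><td>Revenue</td><td>Profit</td></tr>
#
# parse_table_to_2d_dict(table) ->
# ({0: {0: 'Name', 1: '2019', 2: '2020'},
#   1: {0: 'Name', 1: 'Revenue', 2: 'Profit'}},
#  True)  # is_head_two_rowspan, because the header cell spans two rows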
Example #7
 def parse_content(self, html_file_path):
     """
     Parse the paragraph text in an HTML file.
     Returns an array of paragraphs in document order,
     where each paragraph is an array of content lines.
     :param html_file_path:
     :return:
     """
     rs = []
     with codecs.open(html_file_path, encoding='utf-8', mode='r') as fp:
         soup = BeautifulSoup(fp.read(), "html.parser")
         paragraphs = []
         for div in soup.find_all('div'):
             div_type = div.get('type')
             if div_type is not None and div_type == 'paragraph':
                 paragraphs.append(div)
         for paragraph_div in paragraphs:
             has_sub_paragraph = False
             for div in paragraph_div.find_all('div'):
                 div_type = div.get('type')
                 if div_type is not None and div_type == 'paragraph':
                     has_sub_paragraph = True
             if has_sub_paragraph:
                 continue
             rs.append([])
             for content_div in paragraph_div.find_all('div'):
                 div_type = content_div.get('type')
                 if div_type is not None and div_type == 'content':
                     rs[-1].append(TextUtils.clean_text(content_div.text))
     paragraphs = []
     for content_list in rs:
         if len(content_list) > 0:
             paragraphs.append(''.join(content_list))
     return paragraphs
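As a concrete illustration (assuming TextUtils.clean_text only strips whitespace), an HTML fragment like the following yields one joined string per leaf paragraph:

# <div type="paragraph">
#   <div type="content">First sentence.</div>
#   <div type="content">Second sentence.</div>
# </div>
#
# parse_content(path) -> ['First sentence.Second sentence.']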
Example #8
 def parse_table_to_2d_dict(table):
     rs_dict = {}
     row_index = 0
     is_head_two_rowspan, is_head = False, True
     for tr in table.find_all('tr'):  # each tr is one row of the table
         col_index, cur_col_index = 0, 0
         for td in tr.find_all('td'):  # look at the data in each cell of this row
             rowspan = td.get('rowspan')  # rows this cell spans, i.e. one cell's value applies to several rows
             rowspan = int(rowspan) if (rowspan is not None
                                        and int(rowspan) > 1) else 1
             colspan = td.get('colspan')  # columns this cell spans
             colspan = int(colspan) if (colspan is not None
                                        and int(colspan) > 1) else 1
             if is_head:
                 if rowspan > 1 or colspan > 1:  # (what exactly do is_head_two_rowspan and is_head mean?)
                     is_head_two_rowspan = True
                 is_head = False
             for r in range(rowspan):
                 if (row_index + r) not in rs_dict:  # one dict per row to store that row's cells
                     rs_dict[row_index + r] = {}
                 for c in range(colspan):
                     cur_col_index = col_index
                     while cur_col_index in rs_dict[row_index + r]:
                         cur_col_index += 1
                     # store the cell in this row's dict, keyed by column index, value is the cell text
                     rs_dict[
                         row_index +
                         r][cur_col_index] = TextUtils.remove_blank_chars(
                             td.text)  # nested dict: row index -> column index -> cell text
                     cur_col_index += 1
             col_index = cur_col_index
         row_index += 1
     return rs_dict, is_head_two_rowspan
Example #9
 def normalize_num(text):
     '''
     Convert a number expression to a standard decimal format.
     normalize subroutine.
     '''
     coeff = 1.0
     if '亿' in text:
         coeff *= 100000000
     if '万' in text:
         coeff *= 10000
     if '千' in text or '仟' in text:
         coeff *= 1000
     if '百' in text or '佰' in text:
         coeff *= 100
     if '%' in text:
         coeff *= 0.01
     try:
         number = float(TextUtils.extract_number(text))
         number_text = '%.4f' % (number * coeff)
         if '.' in number_text:
             idx = len(number_text)
             while idx > 1 and number_text[idx - 1] == '0':
                 idx -= 1
             if number_text[idx - 1] == '.':
                 number_text = number_text[:idx - 1]
             else:
                 number_text = number_text[:idx]
         return number_text
     except:
         return text
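Assuming TextUtils.extract_number returns the leading decimal number found in the string, the routine maps Chinese magnitude units and percent signs to plain decimal strings, for example:

# normalize_num('1.5亿')  -> '150000000'   (1.5 * 100000000)
# normalize_num('3万')    -> '30000'
# normalize_num('12%')    -> '0.12'
# normalize_num('n/a')    -> 'n/a'   (no number found, returned unchanged by the bare except)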
Example #10
 def normalize_num(self, text):
     coeff = 1.0
     if '亿' in text:
         coeff *= 100000000
     if '万' in text:
         coeff *= 10000
     if '千' in text or '仟' in text:
         coeff *= 1000
     if '百' in text or '佰' in text:
         coeff *= 100
     if '%' in text:
         coeff *= 0.01
     try:
         number = float(TextUtils.extract_number(text))
         number_text = '%.4f' % (number * coeff)
         if number_text.endswith('.0'):
             return number_text[:-2]
         elif number_text.endswith('.00'):
             return number_text[:-3]
         elif number_text.endswith('.000'):
             return number_text[:-4]
         elif number_text.endswith('.0000'):
             return number_text[:-5]
         else:
             if '.' in number_text:
                 idx = len(number_text)
                 while idx > 1 and number_text[idx - 1] == '0':
                     idx -= 1
                 number_text = number_text[:idx]
             return number_text
     except:
         return text
Example #11
def main():

    try:
        gcp = GCPConnector(PROJECT_ID)

        # This is a basic input configuration object - you might want to use a
        # different approach (e.g. input fields in a Spreadsheet) to allow a more
        # flexible configuration.
        config_data = [['account1', 'Account Number 1'],
                       ['account2', 'Account Number 2']]

        for row in config_data:

            # Add params to be passed via task payload
            task_params = dict()
            task_params['service'] = SERVICE_NAME  # Mandatory field
            task_params['run_script'] = GCE_RUN_SCRIPT  # Mandatory field
            task_params['bucket'] = GCS_BUCKET
            task_params['dataset'] = GBQ_DATASET

            # And add service-specific params
            task_params['account_id'] = row[0]
            task_params['label'] = row[1]
            task_params['schema'] = DATA_SCHEMA
            task_params['filename'] = TextUtils.timestamp() + '_' + str(
                task_params['account_id']) + '.csv'
            task_params['table'] = GBQ_TABLE
            task_params['append'] = True

            service_template_run.service_task(gcp, task_params)

    # pylint: disable=broad-except
    except Exception as e:
        print e.message
Example #12
    def getreportdata(self, report_id, sanitize_rows=True):
        """Get report data.

    Download report data once the report has been executed. This method blocks
    until the report has finished executing.

    Args:
      report_id: ID of the report to be downloaded
      sanitize_rows: Whether to remove commas and quotes from the rows in the
        report or not
    Returns:
      List with all the rows in the report
    """
        dataio = self.__getreportdataraw(report_id)
        reader = csv.reader(dataio)
        data = list()
        for row in reader:
            temp_row = row
            if sanitize_rows:
                temp_row = TextUtils.removecommas(temp_row)
                temp_row = TextUtils.removequotes(temp_row)
            data.append(temp_row)
        return data
Example #13
    def count(self, target):
        """Target text count in document"""
        if not target:
            return 0

        index = self._char_pos_map[target[0]]
        if len(index) == 0:
            return 0

        if len(target) == 1:
            return len(index)

        count = 0
        for pos in index:
            if TextUtils.match(self._document, pos, target):
                count += 1

        return count
Example #14
    def find(self, target):
        """All target text matching start index in document
        Yields:
            index_list
        """
        if not target:
            yield 0
        # get every index where the first character of the target occurs in the document
        index = self._char_pos_map[target[0]]
        if len(index) == 0:
            yield 0

        if len(target) == 1:
            for pos in index:
                yield pos

        for pos in index:
            if TextUtils.match(self._document, pos, target):
                yield pos
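A hedged usage sketch of count and find together, assuming both methods live on a document-index class (called DocumentIndex here, a name not given in the snippets) whose _document attribute holds the raw text and whose _char_pos_map maps each character to the list of positions where it occurs:

index = DocumentIndex(document_text)    # hypothetical wrapper class
print(index.count('注册资本'))           # number of exact occurrences
for pos in index.find('注册资本'):       # start offset of each occurrence
    print(document_text[pos:pos + 20])  # a bit of surrounding text for context

Note that find() yields 0 instead of simply stopping when the target is empty or its first character never occurs, so callers may want to guard those cases.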
Example #15
def main():
    """ 
         Start the Slack Client 
    """
    os.system("clear; figlet 'Slack Gitsin' | lolcat")
    history = FileHistory(os.path.expanduser("~/.slackHistory"))
    while True:
        text = prompt("slack> ", history=history,
                      auto_suggest=AutoSuggestFromHistory(),
                      on_abort=AbortAction.RETRY,
                      style=DocumentStyle,
                      completer=Completer(fuzzy_match=False,
                                          text_utils=TextUtils()),
                      complete_while_typing=Always(),
                      get_bottom_toolbar_tokens=get_bottom_toolbar_tokens,
                      key_bindings_registry=manager.registry,
                      accept_action=AcceptAction.RETURN_DOCUMENT
        )
        slack = Slack(text)
        slack.run_command()
Example #16
def main():

    try:
        gcp = GCPConnector(PROJECT_ID)
        dcm = DCMConnector(credential_file=CREDENTIAL_FILE,
                           user_email=None,
                           profile_id=DCM_PROFILE_ID,
                           api_version=DCM_API_VER)

        # In this example, we're mocking the config parameters (DCM partner and
        # advertiser IDs respectively):
        config_data = [['1234', '1111111'], ['5678', '2222222']]

        for row in config_data:

            # Add params to be passed via task payload
            task_params = dict()
            task_params['service'] = SERVICE_NAME  # Mandatory field
            task_params['run_script'] = GCE_RUN_SCRIPT  # Mandatory field
            task_params['account_id'] = row[0]
            task_params['advertiser_id'] = row[1]
            task_params['bucket'] = GCS_BUCKET
            task_params['dataset'] = GBQ_DATASET

            # And add service-specific params
            task_params['report_template'] = DCM_REPORT_TEMPLATE
            task_params['report_name'] = DCM_REPORT_NAME
            task_params['date_range'] = DCM_REPORT_DATE_RANGE
            task_params['schema'] = DATA_SCHEMA_STANDARD
            task_params['filename'] = TextUtils.timestamp() + '_' + str(
                task_params['account_id']) + '.csv'
            task_params['table'] = GBQ_TABLE
            task_params['append'] = True

            service_example_run.service_task(dcm, gcp, task_params)

    # pylint: disable=broad-except
    except Exception as e:
        print e.message
Example #17
 def parse_content(self, html_file_path):
     """
     Parse the paragraph text in an HTML file.
     Returns an array of paragraphs in document order,
     where each paragraph is an array of content lines.
     :param html_file_path:
     :return:
     """
     rs = []
     with codecs.open(html_file_path, encoding='utf-8', mode='r') as fp:
         soup = BeautifulSoup(fp.read(), "html.parser")
         paragraphs = []
         for div in soup.find_all('div'):
             div_type = div.get('type')
             # collect the divs whose type attribute is 'paragraph'
             if div_type is not None and div_type == 'paragraph':
                 paragraphs.append(div)
         for paragraph_div in paragraphs:
             has_sub_paragraph = False  # check whether this paragraph contains sub-paragraphs
             for div in paragraph_div.find_all('div'):
                 div_type = div.get('type')
                 if div_type is not None and div_type == 'paragraph':
                     has_sub_paragraph = True
             if has_sub_paragraph:
                 continue  # skip paragraphs with sub-paragraphs; those are visited later in the loop
             rs.append([])  # the contents of each paragraph go into their own sub-list of rs
             # append the paragraph's content divs to that list
             for content_div in paragraph_div.find_all('div'):
                 div_type = content_div.get('type')
                 if div_type is not None and div_type == 'content':
                     rs[-1].append(TextUtils.clean_text(content_div.text))
     paragraphs = []
     for content_list in rs:
         if len(content_list) > 0:
             paragraphs.append(
                 ''.join(content_list))  # join each content_list into a single string
     return paragraphs
Example #18
 def get_character_aspects(self, char=None):
     char = char if char else self
     available = self.get_invokable_objects(char)
     aspects_strings = []
     for obj in available:
         if obj['char'].high_concept:
             name = TextUtils.clean(obj['char'].name)
             high_concept = TextUtils.clean(obj['char'].high_concept)
             aspects_strings.append(
                 f'***{high_concept}*** (High Concept of _\'{name}\'_)')
         if obj['char'].trouble:
             name = TextUtils.clean(obj['char'].name)
             trouble = TextUtils.clean(obj['char'].trouble)
             aspects_strings.append(
                 f'***{trouble}*** (Trouble of _\'{name}\'_)')
         if obj.category in ['Aspect', 'Stunt']:
             name = TextUtils.clean(obj['char'].name)
             category = TextUtils.clean(obj['char'].category) + (
                 ' _(Boost)_ ' if self.is_boost else '')
             parent = TextUtils.clean(obj['parent'].name)
             aspects_strings.append(
                 f'***{name}*** ({category} of _\'{parent}\'_)')
     return aspects_strings
Example #19
def task_manager():
  """Task manager function.

  Looks for tasks in the DNA queues and launches CE instances accordingly.
  Returns:
      Standard 'OK' string to confirm completed execution.
  """
  gcp = GCPConnector(PROJECT_ID)
  for level in ['l0', 'l1', 'l2', 'l3']:

    zone = GCE_MACHINE_MAP[level]['zone']
    queue = GCE_MACHINE_MAP[level]['queue']
    quota = GCE_MACHINE_MAP[level]['quota']
    vm_type = GCE_MACHINE_MAP[level]['type']

    # Retrieve the list of existing instances if any
    num_running_ce = 0
    ce_list = gcp.gce_listinstances(zone)
    if 'items' in ce_list:
      for item in ce_list['items']:
        starting_idx = item['machineType'].find('machineTypes/')
        mtype = item['machineType'][starting_idx + 13:]
        if mtype == vm_type:
          num_running_ce += 1

    # Check how many tasks are in the queue and calculate the number of CE
    # machines to be created
    task_list = gcp.gct_listtasks(queue)
    logging.debug('%s elements in queue [%s]', len(task_list), queue)
    if 'tasks' in task_list:
      num_tasks = len(task_list['tasks'])
      pool_size = quota - num_running_ce
      pool_size = min(pool_size, num_tasks)
    else:
      logging.debug('No \'tasks\' in Cloud Task queue')
      # No tasks in the queue
      pool_size = 0
    logging.debug('Level: [%s]. Pool size: %s', level, pool_size)
    # Create a pool of CE instances if pool_size>0
    if pool_size > 0:
      for i in range(pool_size):
        machine_id = '%s-%s' % (level, str(i))
        instance_name = 'dna-machine-%s-%s' % (machine_id,
                                               str(uuid.uuid1().hex))
        logging.debug('Configuring new machine. ID: [%s]. Instance name: [%s]',
                      machine_id, instance_name)

        # Insert a new Datastore entry for each CE instance
        ce_entity = gcp.gds_insert(
            kind=GDS_KIND_CE_INSTANCE,
            attributes={
                'name': instance_name,
                'zone': zone,
                'created': TextUtils.timestamp(),
                't0': time.time(),
                'status': None
            })

        # Get the basic configuration as defined in the GCPConnector class
        ce_config = gcp.gce_configinstance(
            name=instance_name,
            zone=zone,
            machine_type=vm_type,
            service_account=GCE_SERVICE_ACCOUNT,
            scopes=GCE_SCOPES)

        # Add some metadata
        ce_config = gcp.gce_addmetadata(
            config=ce_config,
            key='startup-script-url',
            value=GCE_STARTUP_SCRIPT)

        ce_config = gcp.gce_addmetadata(
            config=ce_config,
            key='shutdown-script-url',
            value=GCE_SHUTDOWN_SCRIPT)

        ce_config = gcp.gce_addmetadata(
            config=ce_config,
            key='machine-id',
            value=machine_id)

        ce_config = gcp.gce_addmetadata(
            config=ce_config,
            key='ce-entity-id',
            value=str(ce_entity.key.id))

        ce_config = gcp.gce_addmetadata(
            config=ce_config,
            key='project-root',
            value=GCS_PROJECT_ROOT)

        ce_config = gcp.gce_addmetadata(
            config=ce_config,
            key='level',
            value=level)

        # Create the instance
        gcp.gce_createinstance(ce_config)
        logging.debug('Instance created. Name: [%s]', instance_name)
        # Update the status of the corresponding Datastore entity
        ce_entity['status'] = DNA_STATUS_CREATED
        gcp.gds_update(ce_entity)
        logging.debug('CE DataStore entity updated')
  return 'OK'
Example #20
 def getLongFromText(text):
     return TextUtils.remove_comma_in_number(text)
Example #21
# `urlparse` is used below but its import was not shown in the snippet;
# a Python 2 style import is assumed here.
import urlparse

from selenium import webdriver
import mysql.connector

from utils import TextUtils


def avatar_url_from_vid(vid):
    return "https://i.ytimg.com/vi/%s/default_live.jpg" % vid


def cover_img_url_from_vid(vid):
    return "https://i.ytimg.com/vi/%s/maxresdefault.jpg" % vid


good_video = [
    lambda v: not TextUtils.is_blank(v['title']),
    lambda v: TextUtils.is_plain_text(v['title']),
]


def parse_video_web_url(u):
    up = urlparse.urlparse(u)

    filtered_fields = ('index', 'list')

    new_query = {}
    for k, v in urlparse.parse_qs(up.query, keep_blank_values=True, strict_parsing=True).items():
        if k not in filtered_fields:
            new_query[k] = v[0] if isinstance(v, list) else v

    return urlparse.urlunparse((up.scheme, up.netloc, up.path, up.params,
Example #22
 def tokenize(self, s: str) -> Sequence:
     s = TextUtils.normalize_newlines(s)
     return seq(re.split(r"\n{2,}", s))
Example #23
 def tokenize(self, s: str) -> Sequence:
     s = TextUtils.normalize_newlines(s)
     return seq(re.split(r"\n", s)).filter(TextUtils.has_content)
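The two tokenize variants above split text into paragraphs (runs of two or more newlines) and into non-empty lines. A self-contained sketch of the same idea using plain re/str operations, standing in for the TextUtils and seq helpers (which are assumptions here):

import re

def split_paragraphs(s):
    # normalize Windows/Mac newlines, then split on blank-line runs
    s = s.replace('\r\n', '\n').replace('\r', '\n')
    return re.split(r'\n{2,}', s)

def split_lines(s):
    s = s.replace('\r\n', '\n').replace('\r', '\n')
    # keep only lines with non-whitespace content
    return [line for line in s.split('\n') if line.strip()]

text = 'First paragraph, line one.\nLine two.\n\nSecond paragraph.'
print(split_paragraphs(text))  # ['First paragraph, line one.\nLine two.', 'Second paragraph.']
print(split_lines(text))       # ['First paragraph, line one.', 'Line two.', 'Second paragraph.']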
Example #24
def get_commentary(soup):
    global last_comment
    global has_updates

    root_div_tags_children = soup.find(
        "article", {"class": "sub-module match-commentary cricket"})

    if (root_div_tags_children is None):
        root_div_tags_children = soup.find(
            "article",
            {"class": "sub-module match-commentary cricket add-padding"})

    if (root_div_tags_children is None):
        LOGGER.error_with_time("Couldn't find article class. Aborting.")
        exit(1)

    root_div_tags_children = root_div_tags_children.find(
        "div", {"class": "content"})

    if (root_div_tags_children is None):
        LOGGER.error_with_time(
            "Couldn't find div for root_div_tags_children. Aborting.")
        exit(1)

    commentary = []

    for commentary_item in root_div_tags_children:
        over = commentary_item.find("div", {"class": "time-stamp"})
        description = commentary_item.find("div", {"class": "description"})

        if (over is None):
            over = ""
        else:
            over = TextUtils.replaceQuotesInText(over.text)

        if (description is None or properties.IS_TEST_MODE):
            description = ""
        else:
            description = TextUtils.replaceQuotesInText(description.text)

        comment = Comment(over, description)
        paragraphs = commentary_item.findAll("p", {"class": "comment"})

        if (paragraphs is None):
            paragraphs = []

        if not properties.IS_TEST_MODE:
            for p in paragraphs:
                p = TextUtils.replaceQuotesInText(p.text)
                comment.add_paragraph(p)

        if (len(over) != 0 or len(description) != 0
                or len(comment.paragraphs) != 0):
            commentary.append(comment)

    commentary.reverse()
    ind = -1

    for i in range(len(commentary)):
        if (commentary[i].over == last_comment.over):
            ind = i

            break

    if (ind >= 0
            and (commentary[ind].description != last_comment.description
                 or commentary[ind].paragraphs != last_comment.paragraphs)):
        ind -= 1

    commentary = commentary[(ind + 1):len(commentary)]

    if (len(commentary) > 0):
        last_comment = commentary[-1]
    else:
        has_updates = False

    return commentary
Example #25
import pandas as pd
import tensorflow as tf
from nn.similarNN import Model
import _pickle as cPickle
from utils import TextUtils

save_model_class = './saved/24072018/model.pkl'
save_model_deep = './saved/24072018/model.ckpt'

data_file_path = './data/cikm_test_a_20180516.txt'
data_file_headers = ['spa_sent_1', 'spa_sent_2']

if __name__ == '__main__':
    text_util = TextUtils()
    text_util.pad_id = 1
    text_util.unk_id = 0
    """
    Restore model
    """
    model = cPickle.load(open(save_model_class, 'rb'))
    model.build(build_session=True, init_word_embedding=None)

    model.restore(save_model_deep)
    """
    Load data
    """
    data_df = pd.read_csv(data_file_path,
                          sep='\t',
                          header=None,
                          names=data_file_headers)
    """
Example #26
def main(argv):
  """Main function reading the task and launching the corresponding ETL job.

  Args:
    argv: array of parameters: (1) queue name, (2) task id.
  """

  # Get input arguments passed by the service-example-run.sh script
  queue_name = str(argv[1])
  task_name = str(argv[2])

  logger.info('Starting service-example processing task. Queue name: [%s]. '
              'Task name: [%s]', queue_name, task_name)

  # Initiate connectors for Google Cloud Platform and DCM.
  gcp = GCPConnector(PROJECT_ID)
  dcm = DCMConnector(
      credential_file=CREDENTIAL_FILE,
      user_email=None,
      profile_id=DCM_PROFILE_ID,
      api_version=DCM_API_VER)

  # Get the first available task from the queue.
  task = gcp.gct_gettask(task_name)
  payload = task['pullMessage']['payload']
  params = json.loads(base64.urlsafe_b64decode(str(payload)))

  # Add service-specific params.
  params['report_template'] = DCM_REPORT_TEMPLATE
  params['report_name'] = DCM_REPORT_NAME
  params['date_range'] = DCM_REPORT_DATE_RANGE
  params['schema'] = DATA_SCHEMA_STANDARD
  params['filename'] = TextUtils.timestamp() + '_' + str(
      params['account_id']) + '.csv'
  params['table'] = GBQ_TABLE
  params['append'] = False

  # Log run info as Datastore entity.
  run_entity = gcp.gds_insert(
      kind=GDS_KIND_LOG_SERVICE,
      attributes={
          'created': TextUtils.timestamp().decode(),
          'service': params['service'].decode(),
          'status': u'RUNNING',
          'error': None,
          'bqjob': None,
          'bqstatus': None,
      })
  try:
    # Run the ETL task with the given params and update the Datastore entity.
    job_id = service_task(dcm, gcp, params)
    run_entity['bqjob'] = job_id.decode()
    run_entity['bqstatus'] = u'RUNNING'
    run_entity['status'] = u'DONE'

  # pylint: disable=broad-except
  except Exception as e:
    run_entity['status'] = u'FAILED'
    run_entity['error'] = str(e).decode()
    logger.error(
        '[%s] - The following error occurred while executing task <%s> : <%s>',
        SERVICE_NAME, task_name, str(e))
  finally:
    gcp.gds_update(run_entity)
Example #27
 def get_long_from_text(text):
     return TextUtils.remove_comma_in_number(text)
Example #28
def make_fold(train_df, test_df, save_model_class, save_model_deep):
    text_util = TextUtils()

    # preprocessing and tokenization
    train_spa_sent_1_df = train_df['spa_sent_1'].tolist()
    train_spa_sent_2_df = train_df['spa_sent_2'].tolist()

    test_spa_sent_1_df = test_df['spa_sent_1'].tolist()
    test_spa_sent_2_df = test_df['spa_sent_2'].tolist()

    train_spa_tokens_1 = text_util.tokenize(sentences=train_spa_sent_1_df,
                                            language=text_util.spanish)
    train_spa_tokens_2 = text_util.tokenize(sentences=train_spa_sent_2_df,
                                            language=text_util.spanish)
    test_spa_tokens_1 = text_util.tokenize(sentences=test_spa_sent_1_df,
                                           language=text_util.spanish)
    test_spa_tokens_2 = text_util.tokenize(sentences=test_spa_sent_2_df,
                                           language=text_util.spanish)

    # building vocabulary (using only the training dataset)
    train_spa_tokens = train_spa_tokens_1 + train_spa_tokens_2
    train_label_df = train_df['label'].tolist()

    (spa_id2word, spa_word2id), spa_E_by_id = text_util.create_word_vocab(
        lst_tokens=train_spa_tokens,
        word_dim=300,
        fasttext_path='./data/new/pretrained/mine.wiki.es.vec')
    (id2label, label2id) = text_util.create_label_vocab(labels=train_label_df)

    # building dataset (i.e. convert tokens and labels to their corresponding ids)
    train_dataset = text_util.create_dataset(lst_tokens_1=train_spa_tokens_1,
                                             lst_tokens_2=train_spa_tokens_2,
                                             labels=train_label_df,
                                             label2id=label2id,
                                             word2id_1=spa_word2id,
                                             word2id_2=spa_word2id)

    test_dataset = text_util.create_dataset(lst_tokens_1=test_spa_tokens_1,
                                            lst_tokens_2=test_spa_tokens_2,
                                            labels=test_df['label'].tolist(),
                                            label2id=label2id,
                                            word2id_1=spa_word2id,
                                            word2id_2=spa_word2id)

    # create batch
    train_batches = text_util.create_batch(dataset=train_dataset,
                                           batch_size=batch_size)
    test_batches = text_util.create_batch(dataset=test_dataset,
                                          batch_size=batch_size)

    # training
    train_score = train(train_batchs=train_batches,
                        test_batchs=test_batches,
                        n_epoch=n_epoch,
                        init_lr=init_lr,
                        init_keep_prob=init_keep_prob,
                        init_word_emb=spa_E_by_id,
                        text_util=text_util,
                        save_model_class=save_model_class,
                        save_model_deep=save_model_deep,
                        word2id=spa_word2id,
                        label2id=label2id)

    return train_score