Example #1
 def parse_item_page(self, response):
     item_data = {
         "title": remove_unicode(
             response.xpath('//meta[@property="og:title"]/@content')
             .extract()[0].strip()),
         "author": " ".join(
             response.xpath('//span[@class="author"]//text()')
             .extract()[1:-1]).strip(),
         "date": parse(
             response.xpath('//meta[@property="article:published_time"]/@content')
             .extract()[0].strip(),
             fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S"),
         "description": remove_unicode(
             response.xpath('//meta[@property="og:description"]/@content')
             .extract()[0].strip()),
         "content": self._get_content(response),
         "url": response.url,
     }
     yield NewsItem(**item_data)
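
Every example in this set calls a remove_unicode helper that none of the excerpts define. A minimal sketch of what such a helper might look like, assuming the goal is simply to strip non-ASCII characters (the original project may normalize rather than drop them):

import unicodedata

def remove_unicode(text):
    # Decompose accented characters, then drop anything outside ASCII.
    normalized = unicodedata.normalize('NFKD', text)
    return normalized.encode('ascii', 'ignore').decode('ascii')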
Example #2
 def parse(self, response):
     item_data = {
         "title": remove_unicode(response.xpath('//meta[@name="dc.title"]/@content').extract()[0].strip()),
         "author": " ".join(response.xpath('//*[@class="article-source"]//text()').extract()).strip(),
         "date": parse(response.xpath('//meta[@name="dc.date"]/@content').extract()[0], fuzzy=True).strftime(
             "%Y-%m-%dT%H:%M:%S"),
         "description": remove_unicode(
             response.xpath('//meta[@name="dc.description"]/@content').extract()[0].strip()),
         "content": remove_unicode(
             ' '.join(response.xpath('//*[@class="article-body"]/p//text()').extract()).strip()),
         "url": response.url,
     }
     yield NewsItem(**item_data)
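
Examples #1, #2, #5, and #6 all yield a NewsItem. Assuming it is a standard Scrapy item with one Field per scraped key, a minimal sketch:

import scrapy

class NewsItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    description = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()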
Example #3
 def _get_content(self, response, string="SUBSCRIBE"):
     # Collect article paragraph text, skipping inline <script> content.
     ps = response.xpath(
         '//article//p//*[not(self::script)]//text()').extract()
     # Truncate at the subscription prompt, if present.
     if string in ps:
         ps = ps[:ps.index(string)]
     ps = map(lambda s: s.strip(), ps)
     return remove_unicode(" ".join(ps).strip())
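
For illustration, the truncation step behaves like this on a plain list (hypothetical data, not from the source):

ps = ["First paragraph.", "Second paragraph.", "SUBSCRIBE", "Footer junk"]
ps = ps[:ps.index("SUBSCRIBE")]  # -> ["First paragraph.", "Second paragraph."]

Note that the match must be exact: a text node of " SUBSCRIBE " would slip through, since stripping happens only after the truncation check.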
Example #4
 def process(self, tweet):
     # Strip URLs, decode HTML entities, then drop unicode characters.
     removed_url = re.sub(r'http\S+', '', tweet.text)
     processed_text = html.unescape(removed_url)
     text = utils.remove_unicode(processed_text)
     tweet_user = tweet.user.screen_name
     tweet_time = self.process_tweet_time(tweet.created_at)
     return "Posted by @{} on {}:".format(tweet_user, tweet_time), text
Example #5
    def parse(self, response):
        item = NewsItem()

        page_type = response.xpath('//meta[@property="og:type"]//@content').extract_first()

        if page_type is None or "article" not in page_type:
            return

        item['url'] = response.url
        item['date'] = parse(
            response.xpath('//*[@id="article-feed"]/article[1]//span[@class="timestamp"]').extract()[0],
            fuzzy=True).strftime("%Y-%m-%dT%H:%M:%S")

        try:
            item['author'] = " ".join(
                response.xpath('//*[@id="article-feed"]/article[1]//div[@class="author"]//text()')
                    .extract()).strip()
        except IndexError:
            # Note: joining over extract() cannot raise IndexError; this guard
            # only matters if the expression is changed to index the result.
            item['author'] = ''
        item['title'] = response.xpath('//meta[@property="og:title"]//@content').extract()[0].strip()
        item['description'] = response.xpath(
            '//meta[@property="og:description"]//@content').extract_first().rstrip()

        item['content'] = remove_unicode(' '.join(response.xpath(
            '//*[@id="article-feed"]/article[1]//*[@class="article-body"]//*[@itemprop="articleBody"]//text()').extract()).rstrip())

        yield item
Example #6
    def parse(self, response):
        item = NewsItem()

        lang = response.xpath(
            '//*[@id="responsive-news"]//meta[@property="og:locale"]//@content'
        ).extract_first()

        page_type = response.xpath(
            '//*[@id="responsive-news"]//meta[@property="og:type"]//@content'
        ).extract_first()

        if lang is None or "en" not in lang or page_type is None or "article" not in page_type:
            return

        item['url'] = response.url

        try:
            item['date'] = datetime.utcfromtimestamp(
                float(response.xpath(
                    '//div[@class="story-body"]//div[contains(@class,"date date--v2")]//@data-seconds'
                ).extract_first())
            ).strftime("%Y-%m-%dT%H:%M:%S")
        except TypeError:
            item['date'] = ''
        try:
            _author = response.xpath(
                '//*//span[@class="byline__name"]//text()').extract_first()
            if _author is None:
                item['author'] = 'BBC News'
            else:
                _author_split = _author.split(" ")
                if _author_split[0] == "By":
                    _author = " ".join(_author_split[1:])
                item['author'] = _author + " | BBC News"
            # The article:author meta tag is not usable here:
            # " ".join(
            #     response.xpath('//*[@id="responsive-news"]//meta[@property="article:author"]//@content')
            #         .extract()[0]).strip()
            # returns https://www.facebook.com/bbcnews
        except IndexError:
            item['author'] = 'BBC News'

        item['title'] = response.xpath(
            '//*[@id="responsive-news"]//meta[@property="og:title"]//@content'
        ).extract_first().strip()

        item['description'] = response.xpath(
            '//*[@id="responsive-news"]//meta[@property="og:description"]//@content'
        ).extract_first().rstrip()

        item['content'] = remove_unicode(' '.join(
            response.xpath(
                '//div[@class="story-body"]//div[@property="articleBody"]//p//text()'
            ).extract()).rstrip())

        yield item
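
For clarity, the byline handling in Example #6 can be read as a standalone function (hypothetical name; the logic is copied from the spider above):

def normalize_byline(_author):
    # Fall back to the outlet name when no byline is present.
    if _author is None:
        return 'BBC News'
    # Strip a leading "By" and append the outlet name.
    _author_split = _author.split(" ")
    if _author_split[0] == "By":
        _author = " ".join(_author_split[1:])
    return _author + " | BBC News"

normalize_byline("By Jane Doe")  # -> "Jane Doe | BBC News"
normalize_byline(None)           # -> "BBC News"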
Example #7
def read_data(filename: str,
              data_header: dict,
              hmap: dict,
              all_eps=True) -> dict:
    """
    - Assumes that if a "FULL NAME" column exists, all rows will have a format of
        'LastName, FirstName'.
    - Sometimes the header may have unicode (special) characters, cleans before use.
    - hmap is a map of the header fields with official MDS translations,
        where cleansing was required.
    - all_eps == False => Closed eps only
    """
    #data_header = fix_headers(data_header)
    with open(filename, 'r') as csvfile:
        csvfile.readline()
        reader = csv.DictReader(csvfile, data_header)

        if MDS['FNAME'] not in data_header and "FULL NAME" in data_header:
            rows = _split_fullname(reader)
            reader = rows
            data_header.remove("FULL NAME")
            data_header.extend([MDS['FNAME'], MDS['LNAME']])

        # Map each raw header to its cleaned form (headers that are already
        # clean map to themselves).
        clean_headers = {dh: remove_unicode(dh) for dh in data_header}

        result = []
        for i, row in enumerate(reader):
            # Skip entirely blank rows.
            if "".join(row.values()) == '':
                logger.error(
                    f"\n\tFound blank row at {i}; skipping to next row...")
                continue

            # When all_eps is False, keep closed episodes only.
            if not all_eps and not row[MDS['END_DATE']]:
                continue

            # Re-key each row by its cleaned header. (Translation to official
            # MDS names via hmap is currently disabled.)
            result.append({clean_headers[k]: v for k, v in row.items()})

        return {"episodes": result}
Example #8
def _text_to_df(text_file):
    """
    Convert a raw E-Prime output text file into a pandas DataFrame.
    """
    # Load the text file as a list.
    with open(text_file, 'rb') as fo:
        text_data = list(fo)

    # Remove unicode characters.
    filtered_data = [
        remove_unicode(row.decode('utf-8', 'ignore')) for row in text_data
    ]

    # Determine where rows begin and end.
    start_index = [
        i for i, row in enumerate(filtered_data)
        if row == '*** LogFrame Start ***'
    ]
    end_index = [
        i for i, row in enumerate(filtered_data)
        if row == '*** LogFrame End ***'
    ]
    if (not start_index or not end_index
            or len(start_index) != len(end_index)
            or start_index[0] >= end_index[0]):
        print('Warning: LogFrame Starts and Ends do not match up.')
    n_rows = min(len(start_index), len(end_index))

    # Find column headers and remove duplicates.
    headers = []
    data_by_rows = []
    for i in range(n_rows):
        one_row = filtered_data[start_index[i] + 1:end_index[i]]
        data_by_rows.append(one_row)
        for col_val in one_row:
            split_header_idx = col_val.index(':')
            headers.append(col_val[:split_header_idx])

    headers = list(OrderedDict.fromkeys(headers))

    # Preallocate an object array filled with NaNs.
    data_matrix = np.empty((n_rows, len(headers)), dtype=object)
    data_matrix[:] = np.nan

    # Fill list of lists with relevant data from data_by_rows and headers.
    for i in range(n_rows):
        for cell_data in data_by_rows[i]:
            split_header_idx = cell_data.index(':')
            for k_header, header in enumerate(headers):
                if cell_data[:split_header_idx] == header:
                    data_matrix[i, k_header] = \
                        cell_data[split_header_idx + 1:].lstrip()

    df = pd.DataFrame(columns=headers, data=data_matrix)

    # Columns with one value at the beginning, the end, or end - 1 should be
    # filled with that value.
    for col in df.columns:
        # NaN != NaN, so comparing the column to itself flags non-NaN cells.
        non_nan_idx = np.where(df[col].values == df[col].values)[0]
        if len(non_nan_idx) == 1 and non_nan_idx[0] in [
                0, df.shape[0] - 1, df.shape[0] - 2
        ]:
            df.loc[:, col] = df.loc[non_nan_idx[0], col]
    return df
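
A hypothetical usage of the converter, assuming a standard E-Prime text log (the filename is illustrative):

df = _text_to_df('subject01_task.txt')
print(df.shape, list(df.columns)[:5])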