def guancha(keyword, page):
    """Search guancha.cn news for *keyword* and dump the hits to a CSV file.

    The search page is opened in a PhantomJS browser and the element with
    class ``index-add-more`` is clicked ``page - 1`` times (pausing three
    seconds after each click) before the rendered HTML is parsed. Every
    ``div.list-item`` entry is printed and written as a ``title, url, date``
    row to a CSV file in the current directory named after the current
    timestamp (``YYYYmmddHHMMSS.csv``).

    The date column is derived from the last path component of the article
    URL: the ``.shtml`` suffix and the final ``_``-separated field are
    dropped and the remaining underscores become slashes.

    :param keyword: search term; it is URL-quoted before being sent.
    :param page: number of result pages to load; values of 1 or less load
        only the initial page.
    """
    driver = webdriver.PhantomJS()
    try:
        url = 'https://user.guancha.cn/main/search?click=news&keyword=' + urllib.quote(keyword)
        driver.get(url)

        for x in range(1, page):
            print('Start fetching page ' + str(x) + '...')
            driver.find_element_by_class_name('index-add-more').click()
            # Give the page time to append the next batch of results.
            time.sleep(3)

        html = driver.page_source
    finally:
        # Always shut the browser process down, even when a click or the
        # page load fails.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_='list-item')

    filename = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.csv'
    with open(filename, 'w') as csvFile:
        writer = csv.writer(csvFile, dialect='excel')
        writer.writerow(['title', 'url', 'date'])

        for tag in items:
            h4 = tag.find('h4')
            if h4 is None or h4.a is None:
                # Entry without a headline link: nothing to export.
                continue
            href = h4.a.attrs['href']
            # NOTE: this used to be stored in a name called ``time`` declared
            # ``global``, which replaced the ``time`` module with a string and
            # broke every later ``time.sleep`` call.
            stamp = str(href).rsplit('/', 1)[1].replace('.shtml', '')
            date = stamp.rsplit('_', 1)[0].replace('_', '/')
            print(date + ' ' + h4.a.text)
            writer.writerow([h4.a.text.encode('utf-8'), href, date])
Example #2
0
    def parse_traza(self, traza):
        """@brief Split a trace file name into its three fields:
        file, case and time.
        @param traza The name of the trace file.
        @retval The 3 values fich, caso, time as strings; three empty
        strings when the name does not follow the expected format.
        """
        # Structure of the names
        # 0(file):1(case):2(timestamp)
        # Example: LoanApprovalProcess.bpts:LargeAmount-1267033799.94.log
        # The timestamp is in seconds, as obtained from time.time()
        # NOTE(review): the example above contains a single ':' and would
        # therefore be rejected by the three-way split below -- confirm
        # which separator the real trace names use.
        try:
            fich, caso, stamp = traza.split(":")
        except (AttributeError, TypeError, ValueError):
            # Not a string, or not exactly three ':'-separated fields.
            # Malformed names are reported as empty fields, not raised.
            return "", "", ""

        # Drop whatever follows the last '.' (the file extension).
        return fich, caso, stamp.rsplit(".", 1)[0]
Example #3
0
 def _parse_iso_8601(value):
     """
     Parses an ISO8601:2004 date time string.

     The date part may be a calendar date (``YYYY-MM-DD``), an ordinal date
     (``YYYY-DDD``) or a week date (``YYYY-Www-D``), with or without
     hyphens. An optional time part follows a ``T`` and may be ``HH``,
     ``HHMM`` or ``HHMMSS`` (colons optional) with optional fractional
     seconds and an optional ``+hh[mm]`` / ``-hh[mm]`` zone designator.
     Any ``Z`` is ignored.

     :param value: the string to parse.
     :return: ``UTCDateTime(dt)`` plus the fractional seconds and the zone
         correction in seconds (``+`` offsets are subtracted, ``-`` offsets
         are added, i.e. the result is shifted to UTC).
     :raises ValueError: if the date or time part matches none of the
         supported layouts.
     """
     # remove the 'Z' designator
     value = value.replace('Z', '')
     # split between date and time
     try:
         (date, time_part) = value.split("T")
     except ValueError:
         # no 'T' (or more than one): treat the whole string as the date
         date = value
         time_part = ""
     # remove all hyphens in date
     date = date.replace('-', '')
     # remove colons in time
     time_part = time_part.replace(':', '')
     # guess date pattern
     length_date = len(date)
     if date.count('W') == 1 and length_date == 8:
         # we got a week date: YYYYWwwD
         # remove week indicator 'W'
         date = date.replace('W', '')
         year = int(date[0:4])
         # [Www] is the week number, from W01 through W53; [D] is the
         # weekday number, from 1 (Monday) through 7 (Sunday).
         week = int(date[4:6])
         day = int(date[6])
         if not (1 <= week <= 53 and 1 <= day <= 7):
             raise ValueError("Wrong or incomplete ISO8601:2004 date format")
         # ISO weeks cannot be mapped onto strptime's %W by a fixed offset:
         # %W counts from the first Monday of the year, while ISO week 1 is
         # the week containing January 4th. Resolve the calendar date
         # directly from that rule instead.
         jan4 = datetime.date(year, 1, 4)
         week1_monday = jan4 - datetime.timedelta(days=jan4.isoweekday() - 1)
         resolved = week1_monday + datetime.timedelta(weeks=week - 1,
                                                      days=day - 1)
         date = "%04d%02d%02d" % (resolved.year, resolved.month,
                                  resolved.day)
         date_pattern = "%Y%m%d"
     elif length_date == 7 and date.isdigit() and value.count('-') != 2:
         # we got a ordinal date: YYYYDDD
         date_pattern = "%Y%j"
     elif length_date == 8 and date.isdigit():
         # we got a calendar date: YYYYMMDD
         date_pattern = "%Y%m%d"
     else:
         raise ValueError("Wrong or incomplete ISO8601:2004 date format")
     # check for time zone information
     # note that the zone designator is the actual offset from UTC and
     # does not include any information on daylight saving time
     if time_part.count('+') == 1 and '+' in time_part[-6:]:
         (time_part, tz) = time_part.rsplit('+')
         delta = -1
     elif time_part.count('-') == 1 and '-' in time_part[-6:]:
         (time_part, tz) = time_part.rsplit('-')
         delta = 1
     else:
         delta = 0
     if delta:
         # pad so that both the hour and the minute slice are non-empty
         while len(tz) < 3:
             tz += '0'
         delta = delta * (int(tz[0:2]) * 60 * 60 + int(tz[2:]) * 60)
     # split off fractional seconds
     fraction = 0
     if '.' in time_part:
         (time_part, digits) = time_part.split(".")
         fraction = float('0.' + digits.strip())
     # guess time pattern
     length_time = len(time_part)
     if length_time == 6 and time_part.isdigit():
         time_pattern = "%H%M%S"
     elif length_time == 4 and time_part.isdigit():
         time_pattern = "%H%M"
     elif length_time == 2 and time_part.isdigit():
         time_pattern = "%H"
     elif length_time == 0:
         time_pattern = ""
     else:
         raise ValueError("Wrong or incomplete ISO8601:2004 time format")
     # parse patterns
     dt = datetime.datetime.strptime(date + 'T' + time_part,
                                     date_pattern + 'T' + time_pattern)
     # add fractional seconds and apply the time zone correction
     return UTCDateTime(dt) + (float(delta) + fraction)
Example #4
0
 def _parseISO8601(value):
     """
     Parses an ISO8601:2004 date time string.

     Supported date layouts are calendar (``YYYY-MM-DD``), ordinal
     (``YYYY-DDD``) and week (``YYYY-Www-D``) dates, hyphens optional. The
     optional time after ``T`` is ``HH``, ``HHMM`` or ``HHMMSS`` (colons
     optional), optionally followed by fractional seconds and a
     ``+hh[mm]`` / ``-hh[mm]`` zone designator. Any ``Z`` is ignored.

     :param value: the string to parse.
     :return: ``UTCDateTime(dt)`` plus the fractional seconds and the zone
         correction in seconds (``+`` offsets are subtracted, ``-`` offsets
         are added, i.e. the result is shifted to UTC).
     :raises ValueError: if the date or time part matches none of the
         supported layouts.
     """
     # remove the 'Z' designator
     value = value.replace('Z', '')
     # split between date and time
     try:
         (date, clock) = value.split("T")
     except ValueError:
         # no 'T' (or more than one): the whole string is the date
         date = value
         clock = ""
     # remove all hyphens in date
     date = date.replace('-', '')
     # remove colons in time
     clock = clock.replace(':', '')
     # guess date pattern
     length_date = len(date)
     if date.count('W') == 1 and length_date == 8:
         # we got a week date: YYYYWwwD
         # remove week indicator 'W'
         date = date.replace('W', '')
         year = int(date[0:4])
         # [Www] is the week number, from W01 through W53; [D] is the
         # weekday number, from 1 (Monday) through 7 (Sunday).
         week = int(date[4:6])
         day = int(date[6])
         if not (1 <= week <= 53 and 1 <= day <= 7):
             raise ValueError("Wrong or incomplete ISO8601:2004 date format")
         # strptime's %W numbers weeks from the first Monday of the year,
         # which differs from ISO numbering by zero or one week depending
         # on the weekday of January 1st. January 4th is always in ISO
         # week 1, so anchor on the Monday of that week instead.
         jan4 = datetime.date(year, 1, 4)
         week1_monday = jan4 - datetime.timedelta(days=jan4.isoweekday() - 1)
         resolved = week1_monday + datetime.timedelta(weeks=week - 1,
                                                      days=day - 1)
         date = "%04d%02d%02d" % (resolved.year, resolved.month,
                                  resolved.day)
         date_pattern = "%Y%m%d"
     elif length_date == 7 and date.isdigit() and value.count('-') != 2:
         # we got a ordinal date: YYYYDDD
         date_pattern = "%Y%j"
     elif length_date == 8 and date.isdigit():
         # we got a calendar date: YYYYMMDD
         date_pattern = "%Y%m%d"
     else:
         raise ValueError("Wrong or incomplete ISO8601:2004 date format")
     # check for time zone information
     # note that the zone designator is the actual offset from UTC and
     # does not include any information on daylight saving time
     if clock.count('+') == 1 and '+' in clock[-6:]:
         (clock, tz) = clock.rsplit('+')
         delta = -1
     elif clock.count('-') == 1 and '-' in clock[-6:]:
         (clock, tz) = clock.rsplit('-')
         delta = 1
     else:
         delta = 0
     if delta:
         # colons were already stripped from the time part above, so the
         # designator is plain digits here; pad it so that both the hour
         # and the minute slice are non-empty
         while len(tz) < 3:
             tz += '0'
         delta = delta * (int(tz[0:2]) * 60 * 60 + int(tz[2:]) * 60)
     # split off fractional seconds
     fraction = 0
     if '.' in clock:
         (clock, digits) = clock.split(".")
         fraction = float('0.' + digits.strip())
     # guess time pattern
     length_time = len(clock)
     if length_time == 6 and clock.isdigit():
         time_pattern = "%H%M%S"
     elif length_time == 4 and clock.isdigit():
         time_pattern = "%H%M"
     elif length_time == 2 and clock.isdigit():
         time_pattern = "%H"
     elif length_time == 0:
         time_pattern = ""
     else:
         raise ValueError("Wrong or incomplete ISO8601:2004 time format")
     # parse patterns
     dt = datetime.datetime.strptime(date + 'T' + clock,
                                     date_pattern + 'T' + time_pattern)
     # add fractional seconds and apply the time zone correction
     return UTCDateTime(dt) + (float(delta) + fraction)