def get_cookie(email, passw, filter_):
    """Log in to vk.com and build the cookie string for later requests.

    Arguments:
        email   -- account e-mail, sent as the 'email' form field
        passw   -- account password, sent as the 'pass' form field
        filter_ -- news filter value appended as 'remixnews_types'

    Returns the cookie string, or None (after printing 'None :(') when the
    session id cannot be extracted from the login response.
    """
    # NOTE(review): the credentials are posted over plain HTTP; consider
    # switching to HTTPS once the endpoint is confirmed to support it.
    url = 'http://login.vk.com/?act=login'
    values = {
        'expire': '',
        'vk': '1',
        'email': email,
        'pass': passw,
    }
    data = urllib.urlencode(values)
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2',
        'Accept': 'text/html,application/xhtml+xml,application/xml',
        'Accept-Language': 'en-us,en',
        'Accept-Charset': 'ISO-8859-1,utf-8',
        'Referer': 'http://vk.com/index.php',
        'Cookie': 'remixlang=3; remixchk=5',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Content-Length': str(len(data)),
    }

    response = urllib2.urlopen(urllib2.Request(url, data, headers))
    try:
        body = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # The session id lives in the value attribute of the element named 's'.
    sid_field = get_between(body, 0, '\'s\'', '/>')
    if sid_field is None:
        # Single-argument print(...) prints the same text on Python 2.
        print('None :(')
        return None

    sid = get_between(sid_field, 0, 'value=\'', '\'')
    if sid is None:
        # Without this guard the concatenation below would fail on None.
        print('None :(')
        return None

    return ('remixlang=3; remixchk=5; remixsid=' + sid
            + '; remixnews_privacy_filter=0; remixnews_types=' + filter_)
def pars_all(page, max_days=55):
    """Parse every day section of a news page into one list of statuses.

    The page is split on '<div class="feedDay">' markers. For each section
    the date text is extracted and the section is handed to pars_fragment
    together with that date; everything pars_fragment returns is collected.

    Arguments:
        page     -- the page text to parse
        max_days -- upper bound on the number of day sections processed
                    (defaults to the historical limit of 55)

    Returns the collected list (empty when the page has no day marker), or
    None after printing a diagnostic when a date cannot be extracted, when
    pars_fragment fails with TypeError, or when the page still has unparsed
    day sections after max_days iterations.
    """
    marker = '<div class="feedDay">'
    statuses = []
    position = 0  # where the search for the current day marker starts

    for _ in range(max_days):
        day_start = page.find(marker, position)
        if day_start == -1:
            break

        if position == 0:
            date_search_start = 0
        else:
            # NOTE(review): the search deliberately starts a little before
            # the marker; kept as in the original -- confirm against
            # get_between's handling of its start argument.
            date_search_start = position - len(marker) - 2
        date = get_between(page, date_search_start, marker, '</div>')
        if not date:
            # Single-argument print(...) prints the same text on Python 2.
            print('can\'t pars date [pars_all function]')
            return None

        # Text between this marker and the next one; None is taken to mean
        # that no further marker follows, i.e. this is the last day.
        fragment = get_between(page, position, marker, marker)
        is_last_day = fragment is None
        if is_last_day:
            fragment = page[day_start:]

        try:
            statuses.extend(pars_fragment(fragment, date))
        except TypeError:
            print('pars_fragment returns invalid value [pars_all function]')
            return None

        if is_last_day:
            break
        # The next iteration starts at the following day marker.
        position = page.find(marker, day_start + 1)
    else:
        # Reached only when the limit ran out without hitting the last day;
        # a normal break on the final allowed iteration is not an error.
        print('pars error [pars_all function]')
        return None

    return statuses
def pars_status(page, initial):
    """Extract one status entry from a vk news page.

    All news types except statuses must be disabled for the page.

    Arguments:
        page    -- the page text
        initial -- index in page from which the entry is searched

    Returns a 4-tuple (uid, name, status, time):
        ('', '', '', '')      when no 'feedStory' occurs at or after initial
        ('0', '0', '0', '0')  when the entry is malformed or its status
                              text is empty
        otherwise the extracted values; time is None when the time cell
        is missing.
    """
    if page.find('feedStory', initial) == -1:
        return '', '', '', ''

    invalid = ('0', '0', '0', '0')

    story = get_between(page, initial, '<td class="feedStory">', '</td>')
    pattern = (r'href="/(?P<uid>.+?)">(?P<name>.+?)</a>'
               r'(?P<status_with_whitespaces>.*?)(<div|$)')
    # A missing story cell is handled like a non-matching one; searching
    # None would raise TypeError.
    match = None
    if story is not None:
        match = re.search(pattern, story, re.DOTALL)
    if match is None:
        # Single-argument print(...) prints the same text on Python 2.
        print('invalid status format [pars_status function]')
        return invalid

    uid = match.group('uid')
    name = match.group('name')
    # Remove leading and trailing whitespace in one pass.
    status = re.sub(r'^\s+|\s+$', '', match.group('status_with_whitespaces'))
    if not status:
        return invalid

    time_cell = get_between(page, initial, '<td class="feedTime">', '</td>')
    if time_cell is None:
        time = None
    else:
        time = get_between(time_cell, 0, '<div>', '</div>')
    return uid, name, status, time