def parse(page):
    """Parse *page* with ``eparse`` and return an (empty) info dict.

    Only the parsing step is performed here; nothing is extracted from the
    parsed tree, so the returned dict is always empty.

    Returns an empty dict both on success and when ``eparse`` raises
    ``lxml.etree.ParserError`` (in which case a message is written to
    stderr).
    """
    # Bound before the ``try`` so the error path can return it; returning it
    # from the handler before assignment would raise UnboundLocalError.
    info = {}
    try:
        eparse(page)
    except lxml.etree.ParserError:
        # Write to the stderr stream itself.  ``print sys.stderr, msg`` would
        # print the file object's repr and the message to stdout.
        sys.stderr.write("page content error\n")
    # NOTE(review): no extraction step is visible for this variant, so the
    # dict is returned as-is.
    return info
def parse(page):
    """Parse *page* with ``eparse`` and return an (empty) info dict.

    Only the parsing step is performed here; nothing is extracted from the
    parsed tree, so the returned dict is always empty.

    Returns an empty dict both on success and when ``eparse`` raises
    ``lxml.etree.ParserError`` (in which case a message is written to
    stderr).
    """
    # Create the result up front: the error handler returns it, and
    # referencing it before assignment would raise UnboundLocalError.
    info = {}
    try:
        eparse(page)
    except lxml.etree.ParserError:
        # ``print sys.stderr, msg`` prints the stream object's repr to
        # stdout; write to the stream directly instead.
        sys.stderr.write("page content error\n")
    # NOTE(review): no extraction step is visible for this variant, so the
    # dict is returned as-is.
    return info
def parse_csdn(page):
    """Collect link titles and education entries from a profile page.

    The page is parsed with ``eparse``.  Two kinds of values are appended to
    the result list, in this order:

    * for every ``http:`` link inside the first
      ``div[class="per_dynamic"]`` element, its stripped text with newlines
      replaced by spaces (plain strings);
    * one dict per ``div[class="position education vevent vcard"]`` element,
      holding whichever of the keys ``school``, ``degree``, ``major``,
      ``dtstart``, ``dtend``, ``desc`` and ``activities`` were found.

    The collected link titles are also printed to stdout as a debug line.

    Returns an empty list (after writing a message to stderr) when
    ``eparse`` raises ``lxml.etree.ParserError``.
    """
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # Write to the stderr stream itself; ``print sys.stderr, msg`` would
        # print the file object's repr to stdout.
        sys.stderr.write("page content error\n")
        # Same type as the success path (the name ``info`` is not defined
        # in this function).
        return []

    def clean(node):
        # Flatten an element's text onto a single trimmed line.
        return node.text_content().strip().replace('\n', ' ')

    # (selector, result key) pairs, applied in order to each education entry.
    fields = (
        ('h3[class="summary fn org"]', 'school'),
        ('span[class="degree"]', 'degree'),
        ('span[class="major"]', 'major'),
        ('abbr[class="dtstart"]', 'dtstart'),
        # "dtstamp" acts as a fallback: a later "dtend" match overwrites it.
        ('abbr[class="dtstamp"]', 'dtend'),
        ('abbr[class="dtend"]', 'dtend'),
        # These two exact-match selectors differ only by the leading space
        # in the class value and feed different keys.
        ('p[class=" desc details-education"]', 'desc'),
        ('p[class="desc details-education"]', 'activities'),
    )

    r = []
    find = t.cssselect('div[class="per_dynamic"]')
    if find:
        # Relative path (".//"): a leading "//" is absolute and would match
        # links anywhere in the document, not just inside this div.
        titles = find[0].xpath('.//a[contains(@href,"http:")]')
        r.extend(clean(li) for li in titles)
    print("====== %s" % (r,))

    for li in t.cssselect('div[class="position education vevent vcard"]'):
        item = {}
        for selector, key in fields:
            find = li.cssselect(selector)
            if find:
                item[key] = clean(find[0])
        r.append(item)
    return r
def parse(page):
    """Parse a page into an info mapping that includes its ``dynamic`` data.

    The page is parsed with ``eparse``; the result of ``parse_profile`` is
    used as the info mapping, and the result of ``parse_dynamic`` is stored
    in it under the ``'dynamic'`` key.

    Returns an empty dict (after writing a message to stderr) when
    ``eparse`` raises ``lxml.etree.ParserError``.
    """
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # Write to the stderr stream itself; ``print sys.stderr, msg`` would
        # print the file object's repr and the message to stdout.
        sys.stderr.write("page content error\n")
        return {}

    # presumably parse_profile returns a dict; it must support item
    # assignment for the next line to work.
    info = parse_profile(t)
    info['dynamic'] = parse_dynamic(t)
    return info
def parse_dir(t):
    """Return the ``href`` of every ``h2 strong a`` link in a page.

    *t* is the page content handed to ``eparse``.

    Returns an empty list (after writing a message to stderr) when
    ``eparse`` raises ``lxml.etree.ParserError``.  An anchor without an
    ``href`` attribute raises ``KeyError``.
    """
    try:
        # Parse the argument itself: the name ``page`` is not defined in
        # this function, so parsing it would fail with NameError or pick up
        # an unrelated global.
        tree = eparse(t)
    except lxml.etree.ParserError:
        # Write to the stderr stream itself; ``print sys.stderr, msg`` would
        # print the file object's repr and the message to stdout.
        sys.stderr.write("page content error\n")
        return []
    return [link.attrib['href'] for link in tree.cssselect('h2 strong a')]
def parse_csdn(page):
    """Collect link titles and education entries from a profile page.

    The page is parsed with ``eparse``.  Two kinds of values are appended to
    the result list, in this order:

    * for every ``http:`` link inside the first
      ``div[class="per_dynamic"]`` element, its stripped text with newlines
      replaced by spaces (plain strings);
    * one dict per ``div[class="position education vevent vcard"]`` element,
      holding whichever of the keys ``school``, ``degree``, ``major``,
      ``dtstart``, ``dtend``, ``desc`` and ``activities`` were found.

    The collected link titles are also printed to stdout as a debug line.

    Returns an empty list (after writing a message to stderr) when
    ``eparse`` raises ``lxml.etree.ParserError``.
    """
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # ``print sys.stderr, msg`` prints the stream object's repr to
        # stdout; write to the stream directly instead.
        sys.stderr.write("page content error\n")
        # Same type as the success path (the name ``info`` is not defined
        # in this function).
        return []

    def clean(node):
        # Flatten an element's text onto a single trimmed line.
        return node.text_content().strip().replace('\n', ' ')

    # (selector, result key) pairs, applied in order to each education entry.
    fields = (
        ('h3[class="summary fn org"]', 'school'),
        ('span[class="degree"]', 'degree'),
        ('span[class="major"]', 'major'),
        ('abbr[class="dtstart"]', 'dtstart'),
        # "dtstamp" acts as a fallback: a later "dtend" match overwrites it.
        ('abbr[class="dtstamp"]', 'dtend'),
        ('abbr[class="dtend"]', 'dtend'),
        # These two exact-match selectors differ only by the leading space
        # in the class value and feed different keys.
        ('p[class=" desc details-education"]', 'desc'),
        ('p[class="desc details-education"]', 'activities'),
    )

    r = []
    find = t.cssselect('div[class="per_dynamic"]')
    if find:
        # Relative path (".//"): a leading "//" is absolute and would match
        # links anywhere in the document, not just inside this div.
        titles = find[0].xpath('.//a[contains(@href,"http:")]')
        r.extend(clean(li) for li in titles)
    print("====== %s" % (r,))

    for li in t.cssselect('div[class="position education vevent vcard"]'):
        item = {}
        for selector, key in fields:
            find = li.cssselect(selector)
            if find:
                item[key] = clean(find[0])
        r.append(item)
    return r
def parse(page):
    """Parse a page into an info mapping with ``work`` and ``edu`` entries.

    The page is parsed with ``eparse``; the result of ``parse_addition`` is
    used as the info mapping, and the results of ``parse_work`` and
    ``parse_edu`` are stored in it under ``'work'`` and ``'edu'``.

    Returns an empty dict (after writing a message to stderr) when
    ``eparse`` raises ``lxml.etree.ParserError``.
    """
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # Write to the stderr stream itself; ``print sys.stderr, msg`` would
        # print the file object's repr and the message to stdout.
        sys.stderr.write("page content error\n")
        return {}

    # presumably parse_addition returns a dict; it must support item
    # assignment for the lines below to work.
    info = parse_addition(t)
    info['work'] = parse_work(t)
    info['edu'] = parse_edu(t)
    return info
def parse(page):
    """Parse a page into an info mapping with ``work`` and ``edu`` entries.

    The page is parsed with ``eparse``; the result of ``parse_addition`` is
    used as the info mapping, and the results of ``parse_work`` and
    ``parse_edu`` are stored in it under ``'work'`` and ``'edu'``.

    Returns an empty dict (after writing a message to stderr) when
    ``eparse`` raises ``lxml.etree.ParserError``.
    """
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # ``print sys.stderr, msg`` prints the stream object's repr to
        # stdout; write to the stream directly instead.
        sys.stderr.write("page content error\n")
        return {}

    # presumably parse_addition returns a dict; it must support item
    # assignment for the lines below to work.
    info = parse_addition(t)
    info['work'] = parse_work(t)
    info['edu'] = parse_edu(t)
    return info