def analyze(s):
    d = eval(s)  # the serialized record is a Python-literal dict; input is assumed trusted
    special_keys = []
    name = d['name']
    electoral_district_type = d['electoral_district_type']
    electoral_district_name = d['electoral_district_name']
    state = d['state']
    link = d['link']
    text = d['sitetext'].decode('utf-8').lower()  # decode before lower() so non-ASCII characters are lowercased too
    name, last, first = conv.clean_name(name)
    for v in vocabulary:
        special_keys += [conv.search_to_feature_key(v)] * text.count(v.lower())
        # str.replace returns a new string, so assign the result back to actually
        # strip the matched vocabulary term from the text
        text = text.replace(v.lower(), '')
    special_keys += [conv.search_to_feature_key('name')]*text.count(name.lower())
    special_keys += [conv.search_to_feature_key('last')]*text.count(last.lower())
    special_keys += [conv.search_to_feature_key('first')]*text.count(first.lower())
    special_keys += [conv.search_to_feature_key('lastfor')]*text.count(last.lower()+' for')
    special_keys += [conv.search_to_feature_key('lastfor')]*text.count(last.lower()+'for')
    special_keys += [conv.search_to_feature_key('lastfor')]*text.count(last.lower()+'4')
    special_keys += [conv.search_to_feature_key('votelast')]*text.count('vote'+last.lower())
    special_keys += [conv.search_to_feature_key('forstate')]*text.count('for '+state.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('reelect ' +name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('reelect ' +last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('re elect ' +name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('re elect ' +last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('re-elect ' +name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('re-elect ' +last.lower())
    special_keys += [conv.search_to_feature_key('electlast')]*text.count('elect ' +name.lower())
    special_keys += [conv.search_to_feature_key('electlast')]*text.count('elect ' +last.lower())
    special_keys += [conv.search_to_feature_key('votelast')]*text.count('vote '+last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')]*text.count('vote for '+last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')]*text.count('votefor'+last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')]*text.count('vote4'+last.lower())
    try:
        # one feature per occurrence of the last name within 50 characters of
        # "public figure" or "politician"
        special_keys += [conv.search_to_feature_key('politicianpublicfigure')] * len(
            re.findall(r'{last}.{{1,50}}(?:public figure|politician)'.format(
                last=re.escape(last.encode('utf-8'))), text))
    except Exception:
        # debugging hook left in place for unexpected encoding/pattern failures
        import pdb
        pdb.set_trace()
    text = text.replace(name.lower(), '')
    text = text.replace(last.lower(), '')
    text = text.replace(first.lower(), '')
    special_keys += [conv.search_to_feature_key('electoral_district_type')]*sum(text.count(edt.lower()) for edt in conv.district_type_dict[electoral_district_type])
    special_keys += [conv.search_to_feature_key('officename')]*sum(text.count(on.lower()) for on in conv.office_names)
    special_keys += [conv.search_to_feature_key('electoral_district_name')]*text.count(electoral_district_name.lower())
    special_keys += [conv.search_to_feature_key('state')]*text.count(state.lower())
    if conv.strip_and_std(link) in fb_page_data:
        fb_page_dict = fb_page_data[conv.strip_and_std(link)]
        special_keys.append(conv.search_to_feature_key('fbdata'))
        # log-scale the fan count into a repeated feature; guard against
        # math.log(0) on pages with no fans
        fans = int(math.log(max(int(fb_page_dict['fans']), 1)))
        special_keys += [conv.search_to_feature_key('fbdata')] * fans
        if fb_page_dict['authentic'] == 'Authentic':
            special_keys.append(conv.search_to_feature_key('fbauthentic'))
    name_key = conv.search_to_feature_key('name')
    last_key = conv.search_to_feature_key('last')
    first_key = conv.search_to_feature_key('first')
    #print 'name keys ',special_keys.count(name_key),'last keys ', special_keys.count(last_key), 'first keys ', special_keys.count(first_key)
    return basic_analyze(text) + special_keys
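# Usage sketch (illustrative only): analyze() expects the repr() of a record
# dict with the keys read above. The values below are hypothetical, and the
# module-level helpers (conv, vocabulary, basic_analyze, fb_page_data) are
# assumed to be configured elsewhere in this module. Note that
# 'electoral_district_type' has to be a key of conv.district_type_dict.
_sample_record = repr({
    'name': 'Jane Q. Sample',
    'electoral_district_type': 'state_house',
    'electoral_district_name': '12',
    'state': 'vermont',
    'link': 'https://www.facebook.com/janeforvermont',
    'sitetext': 'Vote for Sample! Re-elect Jane Sample for Vermont.',
})
#_sample_features = analyze(_sample_record)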
def analyze(s):
    d=eval(s)
    special_keys = []
    name = d['name']
    electoral_district_type = d['electoral_district_type']
    electoral_district_name = d['electoral_district_name']
    state = d['state']
    link = d['link']
    text = d['sitetext'].decode('utf-8').lower()
    name, last, first = conv.clean_name(name)
    for v in vocabulary:
        special_keys += [conv.search_to_feature_key(v)] * text.count(v.lower())
        text = text.replace(v.lower(), '')
    special_keys += [conv.search_to_feature_key('name')]*text.count(name.lower())
    special_keys += [conv.search_to_feature_key('last')]*text.count(last.lower())
    special_keys += [conv.search_to_feature_key('first')]*text.count(first.lower())
    special_keys += [conv.search_to_feature_key('lastfor')]*text.count(last.lower()+' for')
    special_keys += [conv.search_to_feature_key('lastfor')]*text.count(last.lower()+'for')
    special_keys += [conv.search_to_feature_key('lastfor')]*text.count(last.lower()+'4')
    special_keys += [conv.search_to_feature_key('votelast')]*text.count('vote'+last.lower())
    special_keys += [conv.search_to_feature_key('forstate')]*text.count('for '+state.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('reelect ' +name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('reelect ' +last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('re elect ' +name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('re elect ' +last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('re-elect ' +name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')]*text.count('re-elect ' +last.lower())
    special_keys += [conv.search_to_feature_key('electlast')]*text.count('elect ' +name.lower())
    special_keys += [conv.search_to_feature_key('electlast')]*text.count('elect ' +last.lower())
    special_keys += [conv.search_to_feature_key('votelast')]*text.count('vote '+last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')]*text.count('vote for '+last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')]*text.count('votefor'+last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')]*text.count('vote4'+last.lower())
    text = text.replace(name.lower(), '')
    text = text.replace(last.lower(), '')
    text = text.replace(first.lower(), '')
    special_keys += [conv.search_to_feature_key('electoral_district_type')]*sum(text.count(edt.lower()) for edt in conv.district_type_dict[electoral_district_type])
    special_keys += [conv.search_to_feature_key('officename')]*sum(text.count(on.lower()) for on in conv.office_names)
    special_keys += [conv.search_to_feature_key('electoral_district_name')]*text.count(electoral_district_name.lower())
    special_keys += [conv.search_to_feature_key('state')]*text.count(state.lower())
    name_key = conv.search_to_feature_key('name')
    last_key = conv.search_to_feature_key('last')
    first_key = conv.search_to_feature_key('first')
    #print 'name keys ',special_keys.count(name_key),'last keys ', special_keys.count(last_key), 'first keys ', special_keys.count(first_key)
    return basic_analyze(text) + special_keys
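# Note on the replace() calls above: Python strings are immutable, so
# str.replace() returns a new string and leaves the original untouched. The
# removals only take effect when the result is assigned back, which is why the
# code above uses `text = text.replace(...)`. A tiny self-contained check:
_demo = u'vote for sample'
_demo.replace(u'sample', u'')          # result discarded: no effect
assert _demo == u'vote for sample'
_demo = _demo.replace(u'sample', u'')  # assigned back: removal applied
assert _demo == u'vote for '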
Example #3
def analyze(s):
    d = eval(s)
    special_keys = []
    name = d['name']
    electoral_district_type = d['electoral_district_type']
    electoral_district_name = d['electoral_district_name']
    state = d['state']
    link = d['link']
    text = d['sitetext'].decode('utf-8').lower()
    name, last, first = conv.clean_name(name)
    for v in vocabulary:
        special_keys += [conv.search_to_feature_key(v)] * text.count(v.lower())
        text = text.replace(v.lower(), '')
    special_keys += [conv.search_to_feature_key('name')] * text.count(
        name.lower())
    special_keys += [conv.search_to_feature_key('last')] * text.count(
        last.lower())
    special_keys += [conv.search_to_feature_key('first')] * text.count(
        first.lower())
    special_keys += [conv.search_to_feature_key('lastfor')
                     ] * text.count(last.lower() + ' for')
    special_keys += [conv.search_to_feature_key('lastfor')
                     ] * text.count(last.lower() + 'for')
    special_keys += [conv.search_to_feature_key('lastfor')
                     ] * text.count(last.lower() + '4')
    special_keys += [conv.search_to_feature_key('votelast')
                     ] * text.count('vote' + last.lower())
    special_keys += [conv.search_to_feature_key('forstate')
                     ] * text.count('for ' + state.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('reelect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('reelect ' + last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('re elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('re elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('re-elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('re-elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('electlast')
                     ] * text.count('elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('electlast')
                     ] * text.count('elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('votelast')
                     ] * text.count('vote ' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')
                     ] * text.count('vote for ' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')
                     ] * text.count('votefor' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')
                     ] * text.count('vote4' + last.lower())
    text = text.replace(name.lower(), '')
    text = text.replace(last.lower(), '')
    text = text.replace(first.lower(), '')
    special_keys += [
        conv.search_to_feature_key('electoral_district_type')
    ] * sum(
        text.count(edt.lower())
        for edt in conv.district_type_dict[electoral_district_type])
    special_keys += [conv.search_to_feature_key('officename')] * sum(
        text.count(on.lower()) for on in conv.office_names)
    special_keys += [conv.search_to_feature_key('electoral_district_name')
                     ] * text.count(electoral_district_name.lower())
    special_keys += [conv.search_to_feature_key('state')] * text.count(
        state.lower())
    name_key = conv.search_to_feature_key('name')
    last_key = conv.search_to_feature_key('last')
    first_key = conv.search_to_feature_key('first')
    #print 'name keys ',special_keys.count(name_key),'last keys ', special_keys.count(last_key), 'first keys ', special_keys.count(first_key)
    return basic_analyze(text) + special_keys
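# The blocks above repeat the pattern
#     special_keys += [conv.search_to_feature_key(k)] * text.count(pattern)
# for every phrase of interest. A sketch (not part of the original module) of a
# helper that expresses it once; conv.search_to_feature_key is assumed to
# behave exactly as it is used above.
def _count_feature(special_keys, text, key_name, pattern):
    """Append the feature key for key_name once per occurrence of pattern."""
    special_keys += [conv.search_to_feature_key(key_name)] * text.count(pattern)
    return special_keys
# e.g. _count_feature(special_keys, text, 'voteforlast', 'vote for ' + last.lower())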
def getlinks(candidate, webpage, state, district_type, district_name):
    district_type = district_type.replace('_',' ').strip()
    state = state_map[state.strip()]
    candidate, last, first = conversions.clean_name(candidate)
    candidate = '+'.join(candidate.split(' '))
    print candidate
    state = '+'.join(state.split(' '))
    district_type = '+'.join(district_type.split(' '))
    district_name = '+'.join(district_name.strip().split(' '))
    search_urls = []
    extra_children_searches = []
    precise_searches = []
    search_urls.append(u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}'.format(name=candidate, state=state))
    extra_children_searches.append(u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+info'.format(name=candidate, state=state))
    extra_children_searches.append(u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+sk=info'.format(name=candidate, state=state))
    precise_searches.append(u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+campaign'.format(name=candidate, state=state))
    precise_searches.append(u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+elect'.format(name=candidate, state=state))
    search_urls = [s.encode(chardet.detect(s.encode('utf-8'))['encoding']) for s in search_urls]
    extra_children_searches = [s.encode(chardet.detect(s.encode('utf-8'))['encoding']) for s in extra_children_searches]
    precise_searches = [s.encode(chardet.detect(s.encode('utf-8'))['encoding']) for s in precise_searches]
    old_webpage = webpage
    if webpage != 'www.gernensamples.com':
        webpage = conversions.get_redirect(webpage)
    #if webpage == '404' or webpage == 'ERROR':
        #raise Exception
    websites = []
    webpage_stripped = re.match(r'(?:https?://)?(?:www\.)?(?P<content>.+)',webpage).groupdict()['content'].rstrip('/')
    old_webpage_stripped = re.match(r'(?:https?://)?(?:www\.)?(?P<content>.+)',old_webpage).groupdict()['content'].rstrip('/')
    #TODO strip queries
    webpage_no_queries = ul.urlparse.urlparse(webpage)
    webpage_no_queries = re.match(r'(?:www\.)?(?P<content>.+)',webpage_no_queries.netloc + webpage_no_queries.path).groupdict()['content'].rstrip('/')
    old_webpage_no_queries = ul.urlparse.urlparse(old_webpage)
    old_webpage_no_queries = re.match(r'(?:www\.)?(?P<content>.+)',old_webpage_no_queries.netloc + old_webpage_no_queries.path).groupdict()['content'].rstrip('/')
    patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(webpage=webpage_stripped.lower()))
    old_patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(webpage=old_webpage_stripped.lower()))
    child_patt = re.compile(r'^https?://(?:www\.)?{webpage}.+'.format(webpage=webpage_no_queries.lower()))
    old_child_patt = re.compile(r'^https?://(?:www\.)?{webpage}.+'.format(webpage=old_webpage_no_queries.lower()))
    n = 4
    while True:
        results = map(lambda x: json.loads(requests.get(x).text),search_urls)
        if any(map(lambda r: r.has_key('error') and (r['error']['code'] == 403 or r['error']['code'] == 503),results)):
            print 'sleeping'
            time.sleep(n + random.randint(1,1000)/1000.)
            n = n*2
        elif any(map(lambda r: r.has_key('error'), results)):
            raise Exception(', '.join(map(lambda r: r['error']['message'], filter(lambda r: r.has_key('error'),results))))
        else:
            break
    n = 4
    while True:
        child_results = map(lambda x: json.loads(requests.get(x).text),extra_children_searches)
        if any(map(lambda r: r.has_key('error') and (r['error']['code'] == 403 or r['error']['code'] == 503),child_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1,1000)/1000.)
            n = n*2
        elif any(map(lambda r: r.has_key('error'), child_results)):
            raise Exception(', '.join(map(lambda r: r['error']['message'], filter(lambda r: r.has_key('error'),child_results))))
        else:
            break
    n = 4
    while True:
        precise_results = map(lambda x: json.loads(requests.get(x).text),precise_searches)
        if any(map(lambda r: r.has_key('error') and (r['error']['code'] == 403 or r['error']['code'] == 503),precise_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1,1000)/1000.)
            n = n*2
        elif any(map(lambda r: r.has_key('error'), precise_results)):
            raise Exception(', '.join(map(lambda r: r['error']['message'], filter(lambda r: r.has_key('error'),precise_results))))
        else:
            break

    if type(results) != list:
        print type(results)
        results = [results]
    real_results = [(r if r.has_key('items') else {'items':[]}) for r in results]
    results = real_results
    search_links = [[i['link'].lower() for i in r['items']] for r in results]
    search_text = [[u'{title} {link} {pagemap} {snippet}'.format(**convert_pagemap_dict(i)).lower().encode('utf-8') for i in r['items']] for r in results]
    for ri in range(len(search_links)):
        for si in range(len(search_links[ri])):
            for r in precise_results:
                if r.has_key('items'):
                    for i in r['items']:
                        if conversions.child_or_equal_page(search_links[ri][si], i['link'].lower(), True):
                            search_text[ri][si] += ' bipspecialappearsinprecise'
    child_links = [i['link'].lower() for r in child_results if r.has_key('items') for i in r['items']]
    child_text = [u'{title} {link} {pagemap} {snippet}'.format(**convert_pagemap_dict(i)).lower().encode('utf-8') for r in child_results if r.has_key('items') for i in r['items']]
    #search_text = [[u'{title} {link} {pagemap} {snippet}'.format(**i).lower().encode('utf-8') for i in r['items']] for r in results]
    search_class = [map(lambda s: conversions.page_relation(s, True, webpage,old_webpage),sl) for sl in search_links]
    #search_class = [map(lambda s: 'True' if patt.match(s) != None or old_patt.match(s) != None else ('Child' if child_patt.match(s) != None or old_child_patt.match(s) != None else 'False'),sl) for sl in search_links]
    #print search_text
    #TODO Clean up ssv code
    ssv = [any(map(patt.match,sl)) or any(map(old_patt.match,sl)) for sl in search_links]
    non_websites = [[i['link'] for i in r['items'] if webpage not in i['link']] for r in results]
    cs,ct,cc = zip(*[combine_children(search_links[i],search_text[i],search_class[i], child_links, child_text) for i in range(len(search_links))])
    print 'got there',len(results[0]['items'])
    return non_websites, ssv, webpage_stripped, search_links, search_text, [r['items'] for r in results], search_class, cs, ct, cc,child_links,child_text
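# The three polling loops above share the same retry logic: run the searches,
# back off exponentially (with jitter) on 403/503 quota or availability errors,
# and raise on any other API error. A sketch of that logic as a single helper;
# the name is hypothetical, and json/requests/time/random are the modules
# already used above.
def _search_with_backoff(urls, initial_delay=4):
    delay = initial_delay
    while True:
        results = [json.loads(requests.get(u).text) for u in urls]
        errors = [r['error'] for r in results if 'error' in r]
        if any(e['code'] in (403, 503) for e in errors):
            time.sleep(delay + random.randint(1, 1000) / 1000.)
            delay *= 2
        elif errors:
            raise Exception(', '.join(e['message'] for e in errors))
        else:
            return results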
def analyze(s):
    d = eval(s)
    special_keys = []
    name = d['name']
    electoral_district_type = d['electoral_district_type']
    electoral_district_name = d['electoral_district_name']
    state = d['state']
    link = d['link']
    text = d['sitetext'].decode('utf-8').lower()
    name, last, first = conv.clean_name(name)
    for v in vocabulary:
        special_keys += [conv.search_to_feature_key(v)] * text.count(v.lower())
        text = text.replace(v.lower(), '')
    special_keys += [conv.search_to_feature_key('name')] * text.count(
        name.lower())
    special_keys += [conv.search_to_feature_key('last')] * text.count(
        last.lower())
    special_keys += [conv.search_to_feature_key('first')] * text.count(
        first.lower())
    special_keys += [conv.search_to_feature_key('lastfor')
                     ] * text.count(last.lower() + ' for')
    special_keys += [conv.search_to_feature_key('lastfor')
                     ] * text.count(last.lower() + 'for')
    special_keys += [conv.search_to_feature_key('lastfor')
                     ] * text.count(last.lower() + '4')
    special_keys += [conv.search_to_feature_key('votelast')
                     ] * text.count('vote' + last.lower())
    special_keys += [conv.search_to_feature_key('forstate')
                     ] * text.count('for ' + state.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('reelect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('reelect ' + last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('re elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('re elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('re-elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('reelectlast')
                     ] * text.count('re-elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('electlast')
                     ] * text.count('elect ' + name.lower())
    special_keys += [conv.search_to_feature_key('electlast')
                     ] * text.count('elect ' + last.lower())
    special_keys += [conv.search_to_feature_key('votelast')
                     ] * text.count('vote ' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')
                     ] * text.count('vote for ' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')
                     ] * text.count('votefor' + last.lower())
    special_keys += [conv.search_to_feature_key('voteforlast')
                     ] * text.count('vote4' + last.lower())
    try:
        special_keys += [
            conv.search_to_feature_key('politicianpublicfigure')
        ] * len(
            re.findall(
                r'{last}.{{1,50}}(?:public figure|politician)'.format(
                    last=re.escape(last.encode('utf-8'))), text))
    except Exception:
        import pdb
        pdb.set_trace()
    text = text.replace(name.lower(), '')
    text = text.replace(last.lower(), '')
    text = text.replace(first.lower(), '')
    special_keys += [
        conv.search_to_feature_key('electoral_district_type')
    ] * sum(
        text.count(edt.lower())
        for edt in conv.district_type_dict[electoral_district_type])
    special_keys += [conv.search_to_feature_key('officename')] * sum(
        text.count(on.lower()) for on in conv.office_names)
    special_keys += [conv.search_to_feature_key('electoral_district_name')
                     ] * text.count(electoral_district_name.lower())
    special_keys += [conv.search_to_feature_key('state')] * text.count(
        state.lower())
    if conv.strip_and_std(link) in fb_page_data:
        fb_page_dict = fb_page_data[conv.strip_and_std(link)]
        special_keys.append(conv.search_to_feature_key('fbdata'))
        # guard against math.log(0) on pages with no fans
        fans = int(math.log(max(int(fb_page_dict['fans']), 1)))
        special_keys += [conv.search_to_feature_key('fbdata')] * fans
        if fb_page_dict['authentic'] == 'Authentic':
            special_keys.append(conv.search_to_feature_key('fbauthentic'))
    name_key = conv.search_to_feature_key('name')
    last_key = conv.search_to_feature_key('last')
    first_key = conv.search_to_feature_key('first')
    #print 'name keys ',special_keys.count(name_key),'last keys ', special_keys.count(last_key), 'first keys ', special_keys.count(first_key)
    return basic_analyze(text) + special_keys
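# Illustration of the 'politicianpublicfigure' feature above: it fires once per
# non-overlapping match of the pattern, i.e. the candidate's last name followed
# within 50 characters by "public figure" or "politician". The sample values
# below are hypothetical.
_last = u'sample'
_page = u'jane sample. politician. public figure. contact the campaign.'
_hits = re.findall(
    u'{last}.{{1,50}}(?:public figure|politician)'.format(last=re.escape(_last)),
    _page)
assert len(_hits) == 1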
    "fb/Gubernatorial Races.csv"
) as gr, open("fb/Presidential Race.csv") as pr, open("fb/fbcands.csv") as fc, open("fb/morefbcands.csv", "w") as mfc:
    csvhr = csv.DictReader(hr)
    csvsr = csv.DictReader(sr)
    csvgr = csv.DictReader(gr)
    csvpr = csv.DictReader(pr)
    csvfc = csv.DictReader(fc)
    csvmfc = csv.DictWriter(mfc, csvfc.fieldnames)
    hrdict = {}
    for l in csvhr:
        # parse the "XX-N" district code once instead of matching it twice
        district = re.match(r"(?P<state>\w{2})-(?P<number>\d+)", l["DISTRICT"]).groupdict()
        hrdict[
            (district["state"], int(district["number"]), conversions.clean_name(l["CANDIDATE"]))
        ] = l["URL"].replace("?ref=ts", "")
    srdict = dict(
        ((l["STATE"], conversions.clean_name(l["CANDIDATE"])), l["URL"].replace("?ref=ts", "")) for l in csvsr
    )
    grdict = dict(
        ((l["STATE"], conversions.clean_name(l["CANDIDATE"])), l["URL"].replace("?ref=ts", "")) for l in csvgr
    )
    csvmfc.writeheader()
    for l in csvfc:
        try:
            hrkey = (l["state"], int(l["electoral_district_name"]), conversions.clean_name(l["name"]))
        except ValueError:
            # non-numeric district names fall back to the raw string
            hrkey = (l["state"], l["electoral_district_name"], conversions.clean_name(l["name"]))
def getlinks(candidate, webpage, state, district_type, district_name):
    """
    Gets all the facebook links found via the Google Search API
    """

    # ### Cleanup input variables

    # District
    district_type = district_type.replace('_', ' ').strip()
    district_type = '+'.join(district_type.split(' '))
    district_name = '+'.join(district_name.strip().split(' '))

    # State
    state = state_map[state.strip()]
    state = '+'.join(state.split(' '))

    # Candidate name
    candidate, last, first = conversions.clean_name(candidate)
    candidate = '+'.join(candidate.split(' '))
    #print 'CANDIDATE: {}'.format(candidate)

    # Setup search urls
    search_urls = []
    extra_children_searches = []
    precise_searches = []

    # Common values
    url = "https://www.googleapis.com/customsearch/v1"
    cx = "011743744063680272768:cp4-iesopjm"
    key = "AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA"

    search_urls.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}'.format(
            url=url, cx=cx, key=key, name=candidate, state=state
        )
    )

    # Just searches for general about pages
    extra_children_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+info'.format(
            url=url, cx=cx, key=key, name=candidate, state=state
        )
    )

    # sk=info specifies Facebook's about page
    extra_children_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+sk=info'.format(
            url=url, cx=cx, key=key, name=candidate, state=state
        )
    )

    precise_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+campaign'.format(
            url=url, cx=cx, key=key, name=candidate, state=state
        )
    )

    precise_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+elect'.format(
            url=url, cx=cx, key=key, name=candidate, state=state
        )
    )

    # Clean up encoding of URLs
    search_urls = [
        s.encode(
            chardet.detect(s.encode('utf-8'))['encoding']
        ) for s in search_urls
    ]

    extra_children_searches = [
        s.encode(
            chardet.detect(s.encode('utf-8'))['encoding']
        ) for s in extra_children_searches
    ]

    #print 'SEARCH_URLS: {}'.format(search_urls)

    precise_searches = [
        s.encode(
            chardet.detect(s.encode('utf-8'))['encoding']
        ) for s in precise_searches
    ]

    # This looks like a check for a dummy website used in testing;
    # get_redirect simply gets the final page that returns a 200
    old_webpage = webpage
    if webpage != 'www.gernensamples.com':
        webpage = conversions.get_redirect(webpage)

    #print 'WEBPAGE: {}'.format(webpage)

    has_webpage = True
    # The original version raised an exception here when get_redirect returned
    # '404' or 'ERROR'; that check is currently disabled.
    # Cleanup web pages by removing protocol, subdomain, and trailing '/'

    if has_webpage:
        #print has_webpage
        webpage_stripped = re.match(
            r'(?:https?://)?(?:www\.)?(?P<content>.+)', webpage
        ).groupdict()['content'].rstrip('/')

        old_webpage_stripped = re.match(
            r'(?:https?://)?(?:www\.)?(?P<content>.+)', old_webpage
        ).groupdict()['content'].rstrip('/')

        # TODO strip queries
        webpage_no_queries = ul.urlparse.urlparse(webpage)
        webpage_no_queries = re.match(
            r'(?:www\.)?(?P<content>.+)',
            webpage_no_queries.netloc + webpage_no_queries.path
        ).groupdict()['content'].rstrip('/')

        old_webpage_no_queries = ul.urlparse.urlparse(old_webpage)
        
        #print 'NO:{}'.format(old_webpage_no_queries)
        if old_webpage_no_queries is not None:
            old_webpage_no_queries = re.match(
                r'(?:www\.)?(?P<content>.+)',
                old_webpage_no_queries.netloc + old_webpage_no_queries.path
            ).groupdict()['content'].rstrip('/')

        patt = re.compile(
            r'^https?://(?:www.)?{webpage}/?$'.format(
                webpage=webpage_stripped.lower()
            )
        )
        old_patt = re.compile(
            r'^https?://(?:www.)?{webpage}/?$'.format(
                webpage=old_webpage_stripped.lower()
            )
        )

        child_patt = re.compile(
            r'^https?://(?:www\.)?{webpage}.+'.format(
                webpage=webpage_no_queries.lower()
            )
        )

        old_child_patt = re.compile(
            r'^https?://(?:www\.)?{webpage}.+'.format(
                webpage=old_webpage_no_queries.lower()
            )
        )

    print 'starting'
    n = 4
    while True:
        results = map(lambda x: json.loads(requests.get(x).text), search_urls)
        #for r in results:
        #    print 'error' in r
        if any(map(
                lambda r: ('error' in r and (
                    r['error']['code'] == 403 or r['error']['code'] == 503)
                ), results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000)/1000.)
            n = n*2
        elif any(map(lambda r: 'error' in r, results)):
            raise Exception(', '.join(
                map(
                    lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, results)
                )
            ))
        else:
            break

    n = 4
    while True:
        child_results = map(
            lambda x: json.loads(requests.get(x).text),
            extra_children_searches
        )
        if any(map(
                lambda r: 'error' in r and (
                    r['error']['code'] == 403 or r['error']['code'] == 503
                ), child_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(
            lambda r: 'error' in r, child_results
        )):
            raise Exception(', '.join(
                map(
                    lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, child_results)
                )
            ))
        else:
            break

    n = 4
    while True:
        precise_results = map(
            lambda x: json.loads(requests.get(x).text), precise_searches
        )
        if any(map(
            lambda r: 'error' in r and (
                r['error']['code'] == 403 or r['error']['code'] == 503
                ), precise_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: 'error' in r, precise_results)):
            raise Exception(', '.join(
                map(
                    lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, precise_results)
                )
            ))
        else:
            break

  
    if type(results) != list:
        results = [results]

    # Get results from the "items" key and store it in the results variable
    real_results = [
        (r if 'items' in r else {'items': []}) for r in results
    ]
    results = real_results

    # print 'RESULTS:{}'.format(results)
    # Get the result URLs, Extract searchable text from the pagemap
    search_links = [[i['link'].lower() for i in r['items']] for r in results]
    search_text = [
        [u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)
        ).lower().encode('utf-8') for i in r['items']] for r in results
    ]

    # first loop may be unnecessary
    for ri in range(len(search_links)):  # for 1 to number of result objects
        for si in range(len(search_links[ri])):  # for 1 to number of links
            # For each "precise result" (name+state+'elect'),
            # see if the link is equivalent
            # or a sub page of the main results (name+state)
            for r in precise_results:
                if 'items' in r:
                    for i in r['items']:
                        if conversions.child_or_equal_page(
                            search_links[ri][si], i['link'].lower(), True
                        ):
                            search_text[ri][si] += ' bipspecialappearsinprecise'  # noqa

    # Get the result URLs, Extract searchable text from the pagemap
    child_links = [
        i['link'].lower() for r in child_results if 'items' in r
        for i in r['items']
    ]

    child_text = [
        u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)
        ).lower().encode('utf-8') for r in child_results if 'items' in r
        for i in r['items']
    ]

    # Classify each search link based on its relationship
    # to the provided web page, either PARENT, CHILD, TRUE (identity),
    # or FALSE (no match)
    search_class = [
        map(lambda s: conversions.page_relation(
            s, True, webpage, old_webpage
        ), sl) for sl in search_links
    ]

    # TODO Clean up ssv code

    # Seems to match each search link result against the webpage domain
    ssv = [
        any(map(patt.match, sl)) or any(map(old_patt.match, sl))
        for sl in search_links
    ]

    non_websites = [
        [i['link'] for i in r['items'] if webpage not in i['link']]
        for r in results
    ]

    cs, ct, cc = zip(
        *[combine_children(
            search_links[i], search_text[i], search_class[i],
            child_links, child_text
        ) for i in range(len(search_links))]
    )

    print 'got there', len(results[0]['items'])

    return (non_websites, ssv, webpage_stripped, search_links, search_text,
            [r['items'] for r in results], search_class, cs, ct, cc,
            child_links, child_text)
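# The protocol / 'www.' / trailing-slash stripping above is applied to both the
# resolved and the original webpage. A sketch of that normalization as one
# helper (the name is hypothetical; `re` is the same module used above):
def _strip_url(url):
    """e.g. 'https://www.example.com/path/' -> 'example.com/path'"""
    return re.match(r'(?:https?://)?(?:www\.)?(?P<content>.+)',
                    url).groupdict()['content'].rstrip('/')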
def getlinks(candidate, webpage, state, district_type, district_name):

    # District
    district_type = district_type.replace('_', ' ').strip()
    district_type = '+'.join(district_type.split(' '))
    district_name = '+'.join(district_name.strip().split(' '))

    # State
    state = state_map[state.strip()]
    state = '+'.join(state.split(' '))

    # Candidate Name
    candidate, last, first = conversions.clean_name(candidate)
    candidate = '+'.join(candidate.split(' '))
    #print candidate

    # Search URLs
    search_urls = []
    precise_searches = []


    url = 'https://www.googleapis.com/customsearch/v1'
    cx = '011743744063680272768:xcugk1a_1t0'
    #cx = '009761440872559920339:eqjjlrdgzma'
    key = 'AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA'

    # Create search URLs
    search_urls.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}'.format(
            url=url, cx=cx, key=key, name=candidate, state=state
        )
    )

    precise_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+campaign'.format(
            url=url, cx=cx, key=key, name=candidate, state=state
        )
    )

    precise_searches.append(
        u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+elect'.format(
            url=url, cx=cx, key=key, name=candidate, state=state
        )
    )

    # URL Encoding Cleanup
    search_urls = [
        s.encode(
            chardet.detect(s.encode('utf-8'))['encoding']
        ) for s in search_urls
    ]

    precise_searches = [
        s.encode(
            chardet.detect(s.encode('utf-8'))['encoding']
        ) for s in precise_searches
    ]

    # Resolve a Twitter handle to its web URL before following redirects
    webpage = conversions.twitter_handle_to_web(webpage)
    print webpage
    old_webpage = webpage


    if webpage != 'www.gernensamples.com':
        webpage = conversions.get_redirect(webpage)

    # if webpage == '404' or webpage == 'ERROR':
        # raise Exception

    #print search_urls
    #print precise_searches
    webpage_stripped = re.match(
        r'(?:https?://)?(?:www\.)?(?P<content>.+)', webpage
    ).groupdict()['content'].rstrip('/')

    old_webpage_stripped = re.match(
        r'(?:https?://)?(?:www\.)?(?P<content>.+)', old_webpage
    ).groupdict()['content'].rstrip('/')

    # TODO strip queries
    webpage_no_queries = ul.urlparse.urlparse(webpage)
    webpage_no_queries = re.match(
        r'(?:www\.)?(?P<content>.+)',
        webpage_no_queries.netloc + webpage_no_queries.path
    ).groupdict()['content'].rstrip('/')

    old_webpage_no_queries = ul.urlparse.urlparse(old_webpage)
    old_webpage_no_queries = re.match(
        r'(?:www\.)?(?P<content>.+)',
        old_webpage_no_queries.netloc + old_webpage_no_queries.path
    ).groupdict()['content'].rstrip('/')

    patt = re.compile(
        r'^https?://(?:www.)?{webpage}/?$'.format(
            webpage=webpage_stripped.lower()
        )
    )

    old_patt = re.compile(
        r'^https?://(?:www.)?{webpage}/?$'.format(
            webpage=old_webpage_stripped.lower()
        )
    )

    print 'searching'
    # Timeout work
    n = 4
    while True:
        results = map(lambda x: json.loads(requests.get(x).text), search_urls)
        if any(map(
                lambda r: 'error' in r and (
                    r['error']['code'] == 403 or r['error']['code'] == 503
                ), results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000)/1000.)
            n = n*2
        elif any(map(
                lambda r: 'error' in r, results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, results))
            ))
        else:
            break

    n = 4
    while True:
        precise_results = map(
            lambda x: json.loads(requests.get(x).text), precise_searches
        )
        if any(map(
                lambda r: 'error' in r and (
                    r['error']['code'] == 403 or r['error']['code'] == 503
                ), precise_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000)/1000.)
            n = n*2
        elif any(map(lambda r: 'error' in r, precise_results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: 'error' in r, precise_results))
            ))
        else:
            break
    print 'done searching'

    if type(results) != list:
        results = [results]

    # Get results
    real_results = [(r if 'items' in r else {'items': []}) for r in results]
    results = real_results

    search_links = [
        [conversions.clean_twitter(i['link'].lower()) for i in r['items']]
        for r in results
    ]

    search_text = [
        [u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)
        ).lower().encode('utf-8') for i in r['items']] for r in results
    ]

    for ri in range(len(search_links)):
        for si in range(len(search_links[ri])):
            for r in precise_results:
                if 'items' in r:
                    for i in r['items']:
                        if conversions.child_or_equal_page(
                                search_links[ri][si],
                                conversions.clean_twitter(i['link'].lower()),
                                True):
                            search_text[ri][si] += ' bipspecialappearsinprecise'

    child_links = []
    child_text = []
    search_class = [map(lambda s: conversions.page_relation(
        s, False, webpage, old_webpage), sl) for sl in search_links
    ]

    # TODO Clean up ssv code
    ssv = [
        any(map(patt.match, sl)) or any(map(old_patt.match, sl))
        for sl in search_links
    ]

    non_websites = [
        [
            i['link'] for i in r['items'] if webpage not in i['link']
        ] for r in results
    ]

    cs, ct, cc = zip(*[combine_children(
        search_links[i],
        search_text[i],
        search_class[i],
        child_links,
        child_text) for i in range(len(search_links))
        ]
    )

    print 'got there', len(results[0]['items'])
    return (non_websites, ssv, webpage_stripped, search_links, search_text,
            [r['items'] for r in results], search_class,
            cs, ct, cc, child_links, child_text)
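# Illustration of the ssv check above: it asks whether any returned search link
# is exactly the known campaign page, allowing an optional protocol, 'www.'
# prefix and trailing slash. The URLs below are hypothetical.
_patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
    webpage='janeforvermont.com'))
_links = ['https://www.janeforvermont.com/', 'https://twitter.com/janeforvermont']
assert any(_patt.match(l) for l in _links)
assert not _patt.match('https://www.janeforvermont.com/donate')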
Example #9
     'fb/Presidential Race.csv') as pr, open('fb/fbcands.csv') as fc, open(
         'fb/morefbcands.csv', 'w') as mfc:
 csvhr = csv.DictReader(hr)
 csvsr = csv.DictReader(sr)
 csvgr = csv.DictReader(gr)
 csvpr = csv.DictReader(pr)
 csvfc = csv.DictReader(fc)
 csvmfc = csv.DictWriter(mfc, csvfc.fieldnames)
 hrdict = {}
 for l in csvhr:
     district = re.match(r'(?P<state>\w{2})-(?P<number>\d+)', l['DISTRICT']).groupdict()
     hrdict[(district['state'], int(district['number']),
             conversions.clean_name(l['CANDIDATE']))] = l['URL'].replace('?ref=ts', '')
 srdict = dict(((l['STATE'], conversions.clean_name(l['CANDIDATE'])),
                l['URL'].replace('?ref=ts', '')) for l in csvsr)
 grdict = dict(((l['STATE'], conversions.clean_name(l['CANDIDATE'])),
                l['URL'].replace('?ref=ts', '')) for l in csvgr)
 csvmfc.writeheader()
 for l in csvfc:
     try:
         hrkey = (l['state'], int(l['electoral_district_name']),
                  conversions.clean_name(l['name']))
     except ValueError:
         hrkey = (l['state'], l['electoral_district_name'],
                  conversions.clean_name(l['name']))
     srkey = (l['state'], conversions.clean_name(l['name']))
Example #10
def getlinks(candidate, webpage, state, district_type, district_name):
    district_type = district_type.replace('_', ' ').strip()
    state = state_map[state.strip()]
    candidate, last, first = conversions.clean_name(candidate)
    candidate = '+'.join(candidate.split(' '))
    print candidate
    state = '+'.join(state.split(' '))
    district_type = '+'.join(district_type.split(' '))
    district_name = '+'.join(district_name.strip().split(' '))
    search_urls = []
    extra_children_searches = []
    precise_searches = []
    search_urls.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}'
        .format(name=candidate, state=state))
    extra_children_searches.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+info'
        .format(name=candidate, state=state))
    extra_children_searches.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+sk=info'
        .format(name=candidate, state=state))
    precise_searches.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+campaign'
        .format(name=candidate, state=state))
    precise_searches.append(
        u'https://www.googleapis.com/customsearch/v1?cx=011743744063680272768:cp4-iesopjm&key=AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA&hl=en&q={name}+{state}+elect'
        .format(name=candidate, state=state))
    search_urls = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in search_urls
    ]
    extra_children_searches = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in extra_children_searches
    ]
    precise_searches = [
        s.encode(chardet.detect(s.encode('utf-8'))['encoding'])
        for s in precise_searches
    ]
    old_webpage = webpage
    if webpage != 'www.gernensamples.com':
        webpage = conversions.get_redirect(webpage)
    #if webpage == '404' or webpage == 'ERROR':
    #raise Exception
    websites = []
    webpage_stripped = re.match(r'(?:https?://)?(?:www\.)?(?P<content>.+)',
                                webpage).groupdict()['content'].rstrip('/')
    old_webpage_stripped = re.match(
        r'(?:https?://)?(?:www\.)?(?P<content>.+)',
        old_webpage).groupdict()['content'].rstrip('/')
    #TODO strip queries
    webpage_no_queries = ul.urlparse.urlparse(webpage)
    webpage_no_queries = re.match(
        r'(?:www\.)?(?P<content>.+)', webpage_no_queries.netloc +
        webpage_no_queries.path).groupdict()['content'].rstrip('/')
    old_webpage_no_queries = ul.urlparse.urlparse(old_webpage)
    old_webpage_no_queries = re.match(
        r'(?:www\.)?(?P<content>.+)', old_webpage_no_queries.netloc +
        old_webpage_no_queries.path).groupdict()['content'].rstrip('/')
    patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
        webpage=webpage_stripped.lower()))
    old_patt = re.compile(r'^https?://(?:www.)?{webpage}/?$'.format(
        webpage=old_webpage_stripped.lower()))
    child_patt = re.compile(r'^https?://(?:www\.)?{webpage}.+'.format(
        webpage=webpage_no_queries.lower()))
    old_child_patt = re.compile(r'^https?://(?:www\.)?{webpage}.+'.format(
        webpage=old_webpage_no_queries.lower()))
    n = 4
    while True:
        results = map(lambda x: json.loads(requests.get(x).text), search_urls)
        if any(
                map(
                    lambda r: r.has_key('error') and
                    (r['error']['code'] == 403 or r['error']['code'] == 503),
                    results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: r.has_key('error'), results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: r.has_key('error'), results))))
        else:
            break
    n = 4
    while True:
        child_results = map(lambda x: json.loads(requests.get(x).text),
                            extra_children_searches)
        if any(
                map(
                    lambda r: r.has_key('error') and
                    (r['error']['code'] == 403 or r['error']['code'] == 503),
                    child_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: r.has_key('error'), child_results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: r.has_key('error'), child_results))))
        else:
            break
    n = 4
    while True:
        precise_results = map(lambda x: json.loads(requests.get(x).text),
                              precise_searches)
        if any(
                map(
                    lambda r: r.has_key('error') and
                    (r['error']['code'] == 403 or r['error']['code'] == 503),
                    precise_results)):
            print 'sleeping'
            time.sleep(n + random.randint(1, 1000) / 1000.)
            n = n * 2
        elif any(map(lambda r: r.has_key('error'), precise_results)):
            raise Exception(', '.join(
                map(lambda r: r['error']['message'],
                    filter(lambda r: r.has_key('error'), precise_results))))
        else:
            break

    if type(results) != list:
        print type(results)
        results = [results]
    real_results = [(r if r.has_key('items') else {
        'items': []
    }) for r in results]
    results = real_results
    search_links = [[i['link'].lower() for i in r['items']] for r in results]
    search_text = [[
        u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)).lower().encode('utf-8')
        for i in r['items']
    ] for r in results]
    for ri in range(len(search_links)):
        for si in range(len(search_links[ri])):
            for r in precise_results:
                if r.has_key('items'):
                    for i in r['items']:
                        if conversions.child_or_equal_page(
                                search_links[ri][si], i['link'].lower(), True):
                            search_text[ri][
                                si] += ' bipspecialappearsinprecise'
    child_links = [
        i['link'].lower() for r in child_results if r.has_key('items')
        for i in r['items']
    ]
    child_text = [
        u'{title} {link} {pagemap} {snippet}'.format(
            **convert_pagemap_dict(i)).lower().encode('utf-8')
        for r in child_results if r.has_key('items') for i in r['items']
    ]
    #search_text = [[u'{title} {link} {pagemap} {snippet}'.format(**i).lower().encode('utf-8') for i in r['items']] for r in results]
    search_class = [
        map(lambda s: conversions.page_relation(s, True, webpage, old_webpage),
            sl) for sl in search_links
    ]
    #search_class = [map(lambda s: 'True' if patt.match(s) != None or old_patt.match(s) != None else ('Child' if child_patt.match(s) != None or old_child_patt.match(s) != None else 'False'),sl) for sl in search_links]
    #print search_text
    #TODO Clean up ssv code
    ssv = [
        any(map(patt.match, sl)) or any(map(old_patt.match, sl))
        for sl in search_links
    ]
    non_websites = [[
        i['link'] for i in r['items'] if webpage not in i['link']
    ] for r in results]
    cs, ct, cc = zip(*[
        combine_children(search_links[i], search_text[i], search_class[i],
                         child_links, child_text)
        for i in range(len(search_links))
    ])
    print 'got there', len(results[0]['items'])
    return non_websites, ssv, webpage_stripped, search_links, search_text, [
        r['items'] for r in results
    ], search_class, cs, ct, cc, child_links, child_text
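# Usage sketch (illustrative only): how getlinks() might be driven for a single
# candidate row. Every argument below is hypothetical, the call hits the Google
# Custom Search API, and state_map / conversions / the other module-level
# helpers are assumed to be configured as above.
#
#   (non_websites, ssv, webpage_stripped, search_links, search_text, items,
#    search_class, cs, ct, cc, child_links, child_text) = getlinks(
#       'Jane Q. Sample', 'https://www.janeforvermont.com',
#       'VT', 'state_house', '12')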