Exemple #1
0
    def initialize(self,
                   proxies_per_proto=None,
                   user=None,
                   passw=None,
                   debug=False):
        """Create the bot's collaborators and try to sign in to Twitter.

        Args:
            proxies_per_proto: proxy mapping handed to both the browser and
                the NGD client; defaults to an empty mapping.
            user: value written to the 'session[username_or_email]' field.
            passw: value written to the 'session[password]' field.
            debug: enables HTTP debugging on the browser and makes a failed
                sign-in print its traceback.

        A failed sign-in is never raised to the caller: any Exception is
        caught and reported on stdout.
        """
        # Use a fresh dict per call instead of a shared mutable default, since
        # the same object is handed to two different collaborators.
        if proxies_per_proto is None:
            proxies_per_proto = {}

        print('INIT: TwitterBot')

        self.__br = Browser()
        self.__br.set_proxies(proxies_per_proto)
        self.__br.set_debug_http(debug)
        self.__debug = debug
        self.__sandman = SandMan('TwitterBot')

        self.__ngd = NGD()
        self.__ngd.set_proxies(proxies_per_proto)

        self.__lock = Lock()
        try:
            # sign in
            # NOTE(review): the page is opened over plain http before the
            # credentials are submitted -- confirm the form posts securely.
            self.__br.open("http://twitter.com/")
            # The sign-in form is the second form on the page (index 1).
            self.__br.select_form(nr=1)
            self.__br['session[username_or_email]'] = user
            self.__br['session[password]'] = passw
            self.__br.submit()
            time.sleep(0.2)

        except Exception as e:
            if self.__debug:
                traceback.print_exc(file=sys.stdout)
                print(str(e))
            print('EXCEPTION on TwitterBot, possibly bad user/password or https login don\' work behind a proxy.')
Exemple #2
0
class TwitterBot:
    """Bot that signs in to Twitter through a scripted browser session.

    The class-level strings are regular-expression sources, together with the
    literal prefix/suffix surrounding the interesting value in each match, for
    picking profile links and display names out of Twitter HTML.  They are
    not referenced by the methods visible in this part of the class.
    """

    # Raw strings keep the `\-` escapes intact for the regex engine without
    # relying on Python passing unknown string escapes through unchanged.
    __favorites_regex = r'<a href="http://twitter.com/[a-zA-Z0-9_]+" title="[a-zA-Z0-9\-_ .]+">[a-zA-Z0-9_]+</a>'

    __following_regex = r'<a href="http://twitter.com/[a-zA-Z0-9_]+" rel="contact"><img alt="[a-zA-Z0-9\-_ .]+" class'

    __fav_complete_name_regex = r'title="[a-zA-Z0-9\-_ .]+">'
    __fav_complete_name_prefix = 'title="'
    __fav_complete_name_sufix = '">'

    __foll_complete_name_regex = r'img alt="[a-zA-Z0-9\-_ .]+" class'
    __foll_complete_name_prefix = 'img alt="'
    __foll_complete_name_sufix = '" class'

    __url_regex = 'href="http://twitter.com/[a-zA-Z0-9_]+" '
    __url_prefix = 'href="'
    __url_sufix = '" '

    def __init__(self):
        """Do nothing; the real set-up happens in initialize()."""
        pass

    def initialize(self,
                   proxies_per_proto=None,
                   user=None,
                   passw=None,
                   debug=False):
        """Create the bot's collaborators and try to sign in to Twitter.

        Args:
            proxies_per_proto: proxy mapping handed to both the browser and
                the NGD client; defaults to an empty mapping.
            user: value written to the 'session[username_or_email]' field.
            passw: value written to the 'session[password]' field.
            debug: enables HTTP debugging on the browser and makes a failed
                sign-in print its traceback.

        A failed sign-in is never raised to the caller: any Exception is
        caught and reported on stdout.
        """
        # Use a fresh dict per call instead of a shared mutable default, since
        # the same object is handed to two different collaborators.
        if proxies_per_proto is None:
            proxies_per_proto = {}

        print('INIT: TwitterBot')

        self.__br = Browser()
        self.__br.set_proxies(proxies_per_proto)
        self.__br.set_debug_http(debug)
        self.__debug = debug
        self.__sandman = SandMan('TwitterBot')

        self.__ngd = NGD()
        self.__ngd.set_proxies(proxies_per_proto)

        self.__lock = Lock()
        try:
            # sign in
            # NOTE(review): the page is opened over plain http before the
            # credentials are submitted -- confirm the form posts securely.
            self.__br.open("http://twitter.com/")
            # The sign-in form is the second form on the page (index 1).
            self.__br.select_form(nr=1)
            self.__br['session[username_or_email]'] = user
            self.__br['session[password]'] = passw
            self.__br.submit()
            time.sleep(0.2)

        except Exception as e:
            if self.__debug:
                traceback.print_exc(file=sys.stdout)
                print(str(e))
            print('EXCEPTION on TwitterBot, possibly bad user/password or https login don\' work behind a proxy.')
    def initialize(self,
                   proxies_per_proto=None,
                   user=None,
                   passw=None,
                   debug=False):
        """Create the browser, NGD client, harvest command and SandMan.

        Args:
            proxies_per_proto: proxy mapping handed to the browser and to the
                NGD constructor; defaults to an empty mapping.
            user: accepted but not used by this method.
            passw: accepted but not used by this method.
            debug: enables HTTP debugging on the browser.
        """
        # Use a fresh dict per call instead of a shared mutable default, since
        # the same object is handed to two different collaborators.
        if proxies_per_proto is None:
            proxies_per_proto = {}

        print('INIT: SearchEngineBot')

        self.__br = Browser()
        self.__br.set_proxies(proxies_per_proto)
        self.__br.set_debug_http(debug)
        self.__ngd = NGD(proxies_per_proto)
        self.__harvest_command = EmailHarvestingCommand()
        self.__harvest_command.set_only_complete_names(False)

        self.__sandman = SandMan('SearchEngineBot')
Exemple #4
0
    def __init__(self,
                 proxies=None,
                 entropy_filter=True,
                 lang='en',
                 entropy_top=3,
                 query_top=100,
                 fraction=5):
        """Store the configuration and create the search/translation helpers.

        Args:
            proxies: proxy mapping handed to the NGD constructor; defaults to
                an empty mapping.
            entropy_filter: stored flag for the search-distance filter.
            lang: stored language code; defaults to 'en'.
            entropy_top: stored limit used together with entropy_filter.
            query_top: stored limit for search queries.
            fraction: stored sampling limit.
        """
        # Use a fresh dict per instance instead of a shared mutable default.
        if proxies is None:
            proxies = {}
        self.__ngd = NGD(proxies)
        self.__cache = {}
        self.__min_ent = 0.0
        self.__entropy_filter = entropy_filter
        self.__lang = lang
        self.__entropy_top = entropy_top
        self.__fraction = fraction
        self.__query_top = query_top
        self.__translator = Translate()

        self.__lock = Lock()
        self.__voc_translator = None

        # Fixed seed: note this reseeds the process-wide `random` generator
        # every time an instance is created.
        random.seed(666)
Exemple #5
0
class ChatBot:
    """Chat bot that answers a line with dialogue found through web search.

    reply_to() searches for the quoted input restricted to imsdb.com, pulls
    candidate follow-up lines out of the returned snippets, samples a few of
    them by frequency, optionally ranks the sample with the NGD distance, and
    returns one candidate (or 'ah' when nothing was found).
    """

    def __init__(self,
                 proxies=None,
                 entropy_filter=True,
                 lang='en',
                 entropy_top=3,
                 query_top=100,
                 fraction=5):
        """Store the configuration and create the search/translation helpers.

        Args:
            proxies: proxy mapping handed to the NGD constructor; defaults to
                an empty mapping.
            entropy_filter: when true, sampled answers are ranked with
                NGD.distance and only the best ones are kept.
            lang: language of the conversation; anything other than 'en' is
                translated to English before searching and back afterwards.
            entropy_top: how many ranked answers survive the filter.
            query_top: second argument passed to NGD.snippets_query.
            fraction: upper bound on the number of answers sampled per reply.
        """
        # Use a fresh dict per instance instead of a shared mutable default.
        if proxies is None:
            proxies = {}
        self.__ngd = NGD(proxies)
        self.__cache = {}
        self.__min_ent = 0.0
        self.__entropy_filter = entropy_filter
        self.__lang = lang
        self.__entropy_top = entropy_top
        self.__fraction = fraction
        self.__query_top = query_top
        self.__translator = Translate()

        self.__lock = Lock()
        self.__voc_translator = None

        # Fixed seed: note this reseeds the process-wide `random` generator
        # every time an instance is created.
        random.seed(666)

    def set_voc_translator(self, voc_trans=None):
        """Install (or clear, with None) a callable applied to every reply."""
        self.__voc_translator = voc_trans

    def entropy_min(self, e_min):
        """Store the minimum-entropy setting (not read by the code shown here)."""
        self.__min_ent = e_min

    def reply_to(self, chat_line):
        """Return a reply string for `chat_line`.

        The whole call runs while holding the instance lock.  When no answer
        is found for the full line, the line is shortened one character at a
        time and searched again until something turns up or it is empty.
        Falls back to 'ah' when there is no usable answer.
        """
        with self.__lock:  # released no matter what
            chat_line = normalize_token(chat_line)
            if self.__lang != 'en':
                chat_line = self.__translator.translate(
                    chat_line, self.__lang, 'en')

            answers = []
            while not answers:
                snippets = self.__ngd.snippets_query(
                    '"%s" site:imsdb.com' % chat_line, self.__query_top)
                answers = self.__extract_answers(snippets, chat_line)
                if not answers:
                    chat_line = chat_line[:-1]
                    if not chat_line:
                        break

            # Sample up to `fraction` answers weighted by how often each one
            # occurred, then drop duplicates.
            probabilities = self.__build_probs(answers)
            sample_size = min(len(answers), self.__fraction)
            sampled = [self.__choose_random_answer(probabilities)
                       for _ in range(sample_size)]
            answers = list(set(sampled))

            if self.__entropy_filter:
                scored = []
                for ans in answers:
                    val = self.__ngd.distance(
                        ('"%s"' % chat_line, '"%s"' % ans.encode()))
                    # Answers with a falsy distance (None or 0) are dropped.
                    if val:
                        print('search engine distance (choosing response): %s %f' % (
                            ans, val))
                        time.sleep(0.25)
                        scored.append((ans, val))
                # second_compare is defined elsewhere; presumably it orders
                # the pairs by their second element (the distance) -- confirm.
                scored.sort(second_compare)
                best = [pair[0] for pair in scored[:self.__entropy_top]]
                answers = [a for a in answers if a in best]

            ans = None
            if answers:
                ans = answers[random.randint(0, len(answers) - 1)]

            if not ans:
                ans = 'ah'

            # use vocabulary translator, if available
            if self.__voc_translator:
                ans = self.__voc_translator(ans)

            if ans and self.__lang != 'en':
                ans = self.__translator.translate(ans, 'en',
                                                  self.__lang).lower()
            if not ans:
                ans = 'ah'
            return ans

    @staticmethod
    def __strip_bracketed(snippet):
        """Remove '(...) ' and '[...] ' groups (with their trailing space)."""
        snippet = re.sub(r'\([^\)]+\) ', '', snippet)
        # The square-bracket pattern must stop at ']' (it used to exclude ')'
        # instead, which let one match swallow the text between two groups).
        return re.sub(r'\[[^\]]+\] ', '', snippet)

    def __extract_answer(self, snippet, chat_line):
        """Return the lines in `snippet` that directly follow `chat_line`.

        A candidate line is an all-caps word of two or more letters followed
        by text up to the next '.', '!' or '?'; the all-caps word is dropped
        (presumably a speaker cue -- confirm).  Lines containing dashes,
        colons, '**', parentheses or double quotes are discarded.  Always
        returns a list, empty when nothing qualifies.
        """
        snippet = self.__strip_bracketed(normalize_token(snippet))
        rejected = ('-', ':', '**', '(', ')', '"')
        lines = []
        for match in re.finditer(r'[A-Z][A-Z]+ [^\.!?]+[\.!?]', snippet):
            words = match.group().split(' ')
            line = html2text(' '.join(words[1:]).lower())
            line = line.replace('_', '').replace('\n', '')
            if any(token in line for token in rejected):
                continue
            if line.endswith('.'):
                line = line[:-1]
            lines.append(line)

        # Keep every line whose predecessor contains the chat line.
        needle = chat_line.lower()
        return [current.lower()
                for previous, current in zip(lines, lines[1:])
                if needle in previous.lower()]

    def __extract_answers(self, snippets, chat_line):
        """Collect the stripped, non-empty answers from every snippet."""
        ret = []
        for snippet in snippets:
            for ans in self.__extract_answer(snippet, chat_line):
                if ans != '':
                    ret.append(ans.strip())
        return ret

    def __build_probs(self, answers):
        """Return (answer, relative frequency) pairs for `answers`."""
        counts = {}
        for ans in answers:
            counts[ans] = counts.get(ans, 0) + 1
        total = float(len(answers))
        return [(ans, cnt / total) for ans, cnt in counts.items()]

    def __choose_random_answer(self, probs):
        """Draw one answer from (answer, probability) pairs.

        Returns None only when `probs` is empty.  If rounding leaves the
        cumulative probability just below the random draw, the last answer
        is returned instead of None.
        """
        rand_float = random.random()
        cumulative = 0.0
        chosen = None
        for ans, prob in probs:
            chosen = ans
            cumulative += prob
            if cumulative >= rand_float:
                break
        return chosen

    def start(self):
        """Run an interactive console chat until the user types 'bye'."""
        msg = ''
        while msg != 'bye':
            msg = raw_input('You: ')
            # reply_to() returns a single string, so it must not be unpacked.
            ans = self.reply_to(msg.strip())
            print('Bot: %s' % ans)
        print('end of chat.')

    def save_cache(self):
        """Ask the NGD client to save its cache."""
        self.__ngd.save_cache()
Exemple #6
0
from distance import NGD, NMD, NYD

import time

# One client object per class imported from `distance`.
# NOTE(review): judging by the variable names these are presumably the
# Google, MSN and Yahoo distance implementations -- confirm in `distance`.
google = NGD()
msn = NMD()
yahoo = NYD()

def compare(a, b, google, msn, yahoo):
    """Print the distance each of the three engines reports for (a, b).

    Args:
        a: first term.
        b: second term.
        google, msn, yahoo: objects exposing `distance(a, b)`; each result
            is formatted with '%f', so it must be a number.

    Returns:
        None; the results are written to stdout.
    """
    g = google.distance(a, b)
    m = msn.distance(a, b)
    y = yahoo.distance(a, b)

    # Single-argument print calls behave the same on Python 2 and 3.
    print('for "%s" "%s"' % (a, b))
    print('google: %f msn: %f yahoo: %f' % (g, m, y))
    print('')

# Word pairs used as input for the distance queries below.
pairs = [('by','with'), ('quantum','physics'), ('quantum', 'football')]

# First timing window: the measured call is commented out, so the
# 'took %d seconds' line only reports the gap between two clock reads.
print time.ctime()
b = time.time()
#print google.distances((pairs*30)[:21])
a = time.time()
print time.ctime()
print 'took %d seconds' % (a-b)
# Second timing window: the pair list is repeated 30 times (90 pairs) and
# the first 65 are sent to google.distances().
# NOTE(review): the meaning of the second argument (True) is not visible
# here -- confirm against NGD.distances.
b = time.time()
print google.distances((pairs*30)[:65], True)
a = time.time()
print time.ctime()
# NOTE(review): `a` and `b` are captured for this second window, but the
# elapsed time is never printed.
class SearchEngineBot:
    """Bot that looks up email addresses for names through a search engine.

    It needs no sign-in; initialize() only builds the collaborators (browser,
    NGD client, email-harvesting command and SandMan).
    """

    def __init__(self):
        """Do nothing; the real set-up happens in initialize()."""
        pass

    def initialize(self,
                   proxies_per_proto=None,
                   user=None,
                   passw=None,
                   debug=False):
        """Create the browser, NGD client, harvest command and SandMan.

        Args:
            proxies_per_proto: proxy mapping handed to the browser and to the
                NGD constructor; defaults to an empty mapping.
            user: accepted but not used by this method.
            passw: accepted but not used by this method.
            debug: enables HTTP debugging on the browser.
        """
        # Use a fresh dict per call instead of a shared mutable default, since
        # the same object is handed to two different collaborators.
        if proxies_per_proto is None:
            proxies_per_proto = {}

        print('INIT: SearchEngineBot')

        self.__br = Browser()
        self.__br.set_proxies(proxies_per_proto)
        self.__br.set_debug_http(debug)
        self.__ngd = NGD(proxies_per_proto)
        self.__harvest_command = EmailHarvestingCommand()
        self.__harvest_command.set_only_complete_names(False)

        self.__sandman = SandMan('SearchEngineBot')

        # no sign in

    def set_proxies_per_proto(self, proxies):
        """Store `proxies` and derive a (host, port) tuple for plain HTTP.

        The mapping is forwarded to the NGD client on a best-effort basis: a
        failure there is reported on stdout and otherwise ignored.  The
        'http' entry is expected to look like 'host:port'; when the mapping
        is empty or has no 'http' entry the stored proxy tuple is None.
        """
        self.__proxies = proxies
        try:
            self.__ngd.set_proxies(proxies)
        except Exception:
            # Narrower than a bare except, so KeyboardInterrupt/SystemExit
            # still propagate.  NOTE(review): the message text talks about
            # credentials although only the proxies were being set.
            print('EXCEPTION on SeachEngineBot, possibly bad user/password or https login don\' work behind a proxy.')

        http_proxy = proxies.get('http') if proxies else None
        if not http_proxy:
            proxy = None
        else:
            host_port = http_proxy.split(':')
            proxy = (host_port[0], int(host_port[1]))
        self.__proxy = proxy

    def set_sleep_secs(self, secs):
        """Forward `secs` to SandMan.set_sleep_secs."""
        self.__sandman.set_sleep_secs(secs)

    def set_sleep_module(self, iterations):
        """Forward `iterations` to SandMan.set_sleep_module."""
        self.__sandman.set_sleep_module(iterations)

    def set_sleep_failure(self, secs):
        """Forward `secs` to SandMan.set_sleep_failure."""
        self.__sandman.set_sleep_failure(secs)

    def set_sleep_random_flag(self, bool):
        """Forward the flag to SandMan.set_sleep_random_flag.

        The parameter shadows the builtin `bool`; the name is kept because
        callers may pass it by keyword.
        """
        self.__sandman.set_sleep_random_flag(bool)

    def self_email(self, email, name):
        """Return True when the local part of `email` looks derived from `name`.

        The comparison is case-insensitive.  It succeeds when the name starts
        with the local part, or, for a name made of exactly two
        space-separated parts, when the local part equals 'first.last',
        'first_last', 'first' or the first initial followed by 'last'.
        """
        local_part = email.split('@')[0].lower()
        if name.lower().startswith(local_part):
            return True

        parts = name.split(' ')
        if len(parts) != 2:
            return False

        first, last = parts
        # first[:1] instead of first[0]: a name with a leading space yields an
        # empty first part, which must not raise IndexError.
        candidates = ('.'.join(parts), '_'.join(parts), first,
                      first[:1] + last)
        return any(candidate.lower() == local_part
                   for candidate in candidates)

    def name_to_emails(self, aliases_and_graph):
        """Look up emails for the aliases in an (aliases, graph) pair.

        The pair is unpacked in the body rather than in the signature (tuple
        parameters are Python-2-only syntax); callers still pass one
        positional 2-tuple.  `graph` is not used.
        """
        aliases, _graph = aliases_and_graph
        self.__harvest_command.set_only_complete_names(False)
        return self.__name_to_emails(aliases, 'all_mails')
Exemple #8
0
        emails_with_name[e] = n
        name_with_email[n] = e
# Dump the emails_with_name mapping (email -> name) under a separator line.
print "--------------------------------------------------------------------------------"
print "emails with name"
print str(emails_with_name)


def good_mail(mail):
    """Cheap sanity check for an email address.

    True when the text after the first '@' (up to any further '@') contains
    a dot but no two consecutive dots; False otherwise, including when there
    is no '@' at all.
    """
    pieces = mail.split("@")
    if len(pieces) < 2:
        return False
    domain = pieces[1]
    if "." not in domain:
        return False
    return ".." not in domain


# Now we retrieve non-domain emails.
# mails_per_name: name -> list of other addresses found for that name
# mails_hits: address -> value returned by ngd.results(address)
mails_per_name = {}
mails_hits = {}
ngd = NGD()
for e, n in emails_with_name.iteritems():
    # NOTE(review): the list is reset on every iteration, so if two emails
    # map to the same name only the last one's results survive -- confirm
    # this is intended.
    mails_per_name[n] = []
    # Run one grabber per name in "name2email&domain" mode.
    # NOTE(review): the grabber's initialize/targetRun/finalize contract is
    # defined elsewhere; finalize() is consumed here as (email, name) pairs.
    emails = search_email_address_grabber()
    emails.initialize(n, "name2email&domain")
    emails.targetRun()
    email_list = emails.finalize()
    for e2, n2 in email_list:
        # Keep addresses other than the one we started from, provided they
        # pass the good_mail() sanity check; n2 is not used.
        if e2 != e and good_mail(e2):
            mails_per_name[n].append(e2)
            mails_hits[e2] = ngd.results(e2)

# Print each name followed by the addresses collected for it.
for n, l in mails_per_name.iteritems():
    print n
    print str(l)
    print "--------"