Example #1
def main():
    arg_parser = argparse.ArgumentParser(description='How to run the APK crawler')
    arg_parser.add_argument('--method',
                            help='Feature to run (crawl_new, crawl_old, update_apk)')
    arg_parser.add_argument('--desktop', type=str2bool, default=True,
                            help='true to run as Desktop, false to run as Server')

    args = arg_parser.parse_args()

    # argparse already applies the defaults, so read the parsed values directly;
    # this also avoids an unbound variable when --method is omitted
    desktop = args.desktop
    method = args.method

    playstore_crawler = Crawler(is_desktop=desktop)

    if method == "crawl_new":
        playstore_crawler.crawl_new()
    elif method == "crawl_old":
        playstore_crawler.crawl_old()
    elif method == "update_apk":
        playstore_crawler.update_apk()

    playstore_crawler.close()
Example #2
def main():
    arg_parser = argparse.ArgumentParser(
      description='Mobile App Crawler Manual',
      formatter_class=RawTextHelpFormatter)
    arg_parser.add_argument('-m', '--method', 
      help=('crawl_new: Scrap PlayStore top 300 app information '
            'for each category\n'
            'crawl_old: Update collected app information\n'
            'update_apk: Download APK file'))
    arg_parser.add_argument('-d', '--desktop', 
      type=str2bool, default=True,
      help=('True(Default): Show web browser (use selenium)\n'
            'False: Do not show web browser (use virtual screen)'))

    args = arg_parser.parse_args()

    # argparse already applies the defaults, so read the parsed values directly;
    # this also avoids an unbound variable when --method is omitted
    desktop = args.desktop
    method = args.method

    playstore_crawler = Crawler(is_desktop=desktop)

    if method == "crawl_new":
        playstore_crawler.crawl_new()
    elif method == "crawl_old":
        playstore_crawler.crawl_old()
    elif method == "update_apk":
        playstore_crawler.update_apk()

    playstore_crawler.close()
Example #3
def main():
    stocknum = str(600000)
    total = dict()
    for i in range(1, 10):
        page = str(i)
        crawler = Crawler(stocknum, page)
        datalist = crawler.getData()
        comments = File(stocknum + '_page_' + page, 'json', './data/')
        comments.inputData(datalist)
        with open('./data/' + stocknum + '_page_' + page + '.json', 'r') as f:
            data = f.read()
        jsonData = json.loads(data)
        for detail in jsonData:
            age = detail['age'].encode('utf-8')
            num = float('1' if '年' not in age else age.replace('年', ''))
            date = detail['time'][4:14].encode('utf-8')
            if date not in total:
                total[date] = {'num': 0, 'content': 0}
            total[date]['num'] = total[date]['num'] + num if total[date]['num'] else num
            total[date]['content'] = (total[date]['content'] + detail['content'] * num
                                      if total[date]['content'] else detail['content'] * num)
    total = json.dumps(total)
    totalfile = File(stocknum, 'json', './data/')
    totalfile.inputData(total)
Example #4
    def get(self):
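        # run the crawler against the hard-coded URL and return the JSON it produces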
        cr = Crawler()

        url = "http://loaq.kr/"
        json_file = cr.start(url)

        return json_file
Example #5
def getContent(session, object, title):
    error_data = OrderedDict()
    return_data = OrderedDict()

    highkorea = object
    s = session
    topicurl = title['titleURL']
    topictitle = title['title']
    topicauthor = title['author']
    lastup = title['lastup']
    try:
        tup = highkorea.staticGet(s, highkorea.stem + topicurl.strip('.'))
        s, html, soup = tup[0], tup[1].text, tup[2]
        lastpg = soup.select('div.topic-actions div span a')
        lastpage = lastpg[-1].get('href') if lastpg != [] else soup.select(
            '#page-body h2 a')[0].get('href')
        m = re.compile('&start=(\d+)').search(lastpage)
        num = 1
        if m:
            num = int(m.group(1))
            url = re.sub('&start=(\d+)', '', lastpage)
        else:
            url = lastpage
        content_datas = list()
        html_datas = list()
        image_datas = list()
        for i in range(0, num, 10):
            tup = highkorea.staticGet(
                s, highkorea.stem + url.strip('.') + '&start={}'.format(i))
            s, html, soup = tup[0], tup[1].text, tup[2]
            authors = soup.select(".author strong a.username-coloured")
            contents = soup.find_all('div', {'class': 'content'})
            images = soup.select('dl.attachbox dd dl dt a img')
            for author, content in zip(authors, contents):
                content_data = OrderedDict()
                for br in content.find_all('br'):
                    br.replace_with('\n')
                content_data['author'] = author.text
                content_data['content'] = content.text
                content_datas.append(content_data)
            for image in images:
                image_data = OrderedDict()
                image_data['src'] = image.get('src')
                image_data['name'] = image.get('alt')
                image_datas.append(image_data)
            html_datas.append(html)
        return_data['image'] = image_datas
        return_data['html'] = html_datas
        return_data['content'] = content_datas
        return_data['url'] = topicurl
        return_data['title'] = topictitle
        return_data['author'] = topicauthor
        return_data['lastup'] = lastup
        # return_data[topicurl] = return_data
        return return_data
    except Exception as e:
        error_data[title['titleURL']] = e
        cr.mkjson(error_data, '/home/kyw/json_datas/highkorea',
                  'hkContent_error.json')
        pass
Example #6
def main():
	#crawler instance
	diretorio = sys.argv[1]
	projeto = sys.argv[2]
	createDir(projeto)
	#coletor = ColetorFT(projeto)
	coletor = Crawler(diretorio,projeto)
	coletor.start()
Example #7
def main():
    first_url = raw_input(u"""Enter the start URL: """)
    deepth = int(raw_input(u"""Enter the crawl depth: """))
    examlpe_url = raw_input(u"""Enter a sample download link: """)

    crawler_model = Crawler(first_url,deepth,examlpe_url)
    crawler_model.start()
    for items in crawler_model.final_url_list:
        print items + '\n'
Example #8
def getGamesInCertainRange(date_range, game_dict, data_collection, owner):
    start_date = time.strptime(date_range.split("-")[0], "%Y/%m/%d")
    end_date = time.strptime(date_range.split("-")[1], "%Y/%m/%d")
    res = {}
    rep = {}  #contain games that do not have enough owner
    print("start searching for the existing data...")
    raw_data_list = data_collection.find()
    for data in raw_data_list:
        if "release_date" in data and data["release_date"] != "Not released":
            release_date = time.strptime(
                str(data["release_date"][0]) + "/" +
                str(data["release_date"][1]) + "/" +
                str(data["release_date"][2]), "%Y/%m/%d")
            if time.mktime(release_date) >= time.mktime(
                    start_date) and time.mktime(release_date) <= time.mktime(
                        end_date):
                print("Found desired game!")
                app_data = Crawler.checkSteamSpy(data["appid"])
                if app_data is None or app_data["owners"] < int(owner):
                    print("Owners of game id " + str(data["appid"]) +
                          " is not enough! Deleting...")
                    rep[data["appid"]] = data["name"]
                    continue
                res[data["appid"]] = data["name"]
    print("start checking newly added games")
    for k in game_dict.keys():
        if k in res or k in rep:
            #data already existed
            continue
        result = data_collection.find_one({"appid": k})
        if data_collection.count(
            {"appid": k}) != 0 and "release_date" in result and result[
                "release_date"] != "Not released":
            release_date = time.strptime(
                str(result["release_date"][0]) + "/" +
                str(result["release_date"][1]) + "/" +
                str(result["release_date"][2]), "%Y/%m/%d")
        else:
            #Not in raw database
            bsObj = Crawler.connectToSteamSpy(k)
            release_date = Crawler.getReleaseDateFromSteamSpy(bsObj)
            if release_date == "Not released":
                print("Appid: " + str(k) +
                      " has not been recorded on SteamSpy")
                continue
        if time.mktime(release_date) >= time.mktime(
                start_date) and time.mktime(release_date) <= time.mktime(
                    end_date):
            print("Found desired game!")
            app_data = Crawler.checkSteamSpy(k)
            if app_data is None or app_data["owners"] < int(owner):
                print("Owners of game id " + str(k) +
                      " is not enough! Deleting...")
                continue
            res[k] = game_dict[k]
    return res
Example #9
def GetDumpsOfCrawler():
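    # fetch the CNJ versions page, collect the links to the SQL dumps, and execute each one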
    pagData = Crawler.crawl('https://www.cnj.jus.br/sgt/versoes.php')
    soups = Crawler.getSoup(pagData)
    linkOfDumps = Crawler.GetUrlOfDumps(soups)
    dataDump = []
    for dump in linkOfDumps:
        data = Crawler.crawl(dump)
        dataSoup = Crawler.getSoup(data)
        if re.search("\d{1,2}\_dump_estrutura\.sql", dump):
            ExecuteQueryEstrutura(dataSoup.text)
        else:
            ExecuteQueryDados(dataSoup.text)
Example #10
def task():
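    # on each pass: run the crawler for every valid user in the SQLite database, log completion, then sleep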
    while True:
        db_path = '/root/mysite/info.db'
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute('select openid,account,passwd from user where is_valid="True"')
        value = cursor.fetchall()
        for each in value:
            Crawler.invoke(each[0], each[1], each[2])
            time.sleep(10)
        with open('/root/mysite/task_log.txt','a') as f:
            f.write(str(time.strftime('%Y-%m-%d-%H-%M',time.localtime(time.time())))+':finish\r\n')
        time.sleep(2*60 * 60)
Example #11
def getTitles(session, object, forumurl):
    error_data = list()
    data = list()
    highkorea = object
    s = session
    m = re.compile('&start=(\d+)').search(forumurl)
    num = 1
    datPat = re.compile(
        '\(.\) (?P<month>\d{2}) (?P<day>\d{2}), (?P<year>\d{4}) (?P<hour>\d{1,2}):(?P<minute>\d{2}) (?P<noon>[a-z]{2})'
    )
    if m:
        num = int(m.group(1))
        url = re.sub('&start=(\d+)', '', forumurl)
    else:
        url = forumurl
    for i in range(0, num, 25):
        tup = highkorea.staticGet(
            s, highkorea.stem + url.strip('.') + '&start={}'.format(i))
        s, soup = tup[0], tup[2]
        titles = soup.find_all("a", {"class": "topictitle"})
        dates = soup.find_all("dd", {"class": "lastpost"})
        authors = soup.select('li.row dl dt a.username-coloured')
        year, month, day, hour, minute, noon, unixtime = '', '', '', '', '', '', 0
        for date, tempTitle, tempAuthor in zip([date.text for date in dates],
                                               titles, authors):
            article = OrderedDict()
            m = datPat.search(date)
            try:
                titleURL = tempTitle.get('href')
                title = tempTitle.text
                author = tempAuthor.text
                if m:
                    month, day, year, hour, minute, noon = m.group(
                        "month"), m.group("day"), m.group("year"), m.group(
                            "hour"), m.group("minute"), m.group("noon")
                    hour = int(hour) + 12 if noon == 'pm' else int(hour)
                    d = datetime.datetime(int(year), int(month), int(day),
                                          hour, int(minute))
                    unixtime = int(time.mktime(d.timetuple()))
                else:
                    error_data.append(titleURL)
                article['titleURL'] = titleURL
                article['title'] = title
                article['author'] = author
                article['lastup'] = unixtime
                data.append(article)
            except:
                error_data.append(tempTitle.get('href'))
    cr.mkjson(error_data, '/home/kyw/json_datas/highkorea',
              'hkTitle_error.json')
    return data
Example #12
 def __init__(self, bot, msg):
     self.bot = bot
     self.msg = msg
     #chat attributes
     self.chat_id = str(msg['chat']['id'])
     self.chat_type = msg['chat']['type']
     try:
         self.chat_title = str(msg['chat']['title'])
     except:
         self.chat_title = ''
     #message attributes
     self.msgt = msg.get('text')
     self.msg_id = msg['message_id']
     self.msg_s = msg['text'].split(' ')
     #user attributes
     self.uid = msg['from']['id']
     self.nick = msg['from']['username']
     self.group_id = str(-1001166468779)
     #helper instances
     self.scan = Scanners.Scans(msg)
     self.log = PyMyAdmin.LogManager(self.msg, self.uid, self.nick,
                                     self.chat_id, self.chat_title)
     self.crawler = Crawler.Crawlers(self.msg)
     self.generators = Generators.generators(self.msg)
     self.data = PyMyAdmin.Database('', '', '').get_statistic()
Example #13
def getPageMorts(url, pageFilter):
    crawl = Crawler(url, pageFilter=pageFilter)
    urlsMort = []
    for page in crawl:
        print("HttpCode:%d     Url: %s " % (page.codeHTTP, page.url))

        if page.codeHTTP not in range(200, 300):
            urlsMort.append((page.codeHTTP, page.url))
        # a new dictionary to store the results {url that contains dead links: [dead links]}
        pageParents = {}
        for url in urlsMort:
            for pageParent in crawl.pagesToCrawl_dict[url[1]]:
                if pageParent in pageParents:
                    if url[0] in pageParents[pageParent]:
                        pageParents[pageParent][url[0]].append(url[1])
                    else:
                        pageParents[pageParent][url[0]] = [url[1]]
                else:
                    pageParents[pageParent] = {url[0]: [url[1]]}

    print "\n Crawler Complet!\n"
    with open('liensMort.txt', 'w') as dump_file:
        for pageParent in pageParents:
            dump_file.write('Dans la page : \n{}\n'.format(pageParent))
            dump_file.write('\n')
            codeHTTP = pageParents[pageParent].keys()
            codeHTTP.sort()
            for code in codeHTTP:
                dump_file.write('HTTP return code {}\n'.format(code))
                for url in pageParents[pageParent][code]:
                    dump_file.write('        {}\n'.format(url))
            dump_file.write('*' * 80 + '\n\n')
Example #14
    def level_chosen(self, event):
        level = self.level_choice.GetStringSelection()  # Get level chosen
        self.subject_choice.Clear()
        self.paper_checklist.Clear()
        self.year_choice.Clear()
        self.season_choice.Clear()
        self.num_choice.Clear()
        self.region_choice.Clear()
        self.pairs_info = {}
        self.files_info = {}

        if level == self.level_list[0]:  # Not choosing a level
            return

        # Cache
        global cache_folder
        cache_subject = os.path.join(cache_folder, "GCE Guide %s" % level)
        if not os.path.exists(cache_subject):
            self.subject_dict = Crawler.visit_level(
                Crawler.levels_dict[level])  # Return subject list
            if self.subject_dict == -1:  # Connection error
                wx.MessageBox(
                    "Please check your Internet connection and retry.",
                    "Connection Error")
                self.level_choice.SetSelection(0)
                return
            else:
                Cache.store(self.subject_dict, cache_subject)
        else:
            self.subject_dict = Cache.load(cache_subject)

        subject_list = ["----- Select subject -----"
                        ] + [each for each in self.subject_dict]
        self.subject_choice.Set(subject_list)  # Update subject list
        self.subject_choice.SetSelection(0)
Example #15
def storeUrl4Comp():
    db = pymysql.connect("localhost",
                         "root",
                         "root",
                         "kg4company_2",
                         charset="utf8")
    cur = db.cursor()
    sql1 = u"select name from city;"
    reCount = cur.execute(sql1)  # returns the number of affected rows
    cities1 = cur.fetchall()
    sql2 = u"select name from industry;"
    cur.execute(sql2)
    cities2 = cur.fetchall()
    cities = cities1 + cities2
    sql3 = u"insert into urlList(url4comp) values('{name}');"
    cities = cities[71:]
    for city in cities:
        print city[0]
        for idx in range(1, 31):
            print('page: ' + str(idx))
            cwler = cw.Crawler()
            cwler.getCompUrl(idx, city[0])
            list_ = cwler.listComp
            if len(list_) == 0:
                break
            for li in list_:
                print li
                sqlN = sql3.format(name=li)
                cur.execute(sqlN)
                db.commit()
            time.sleep(2)
            print('page: ' + str(idx) + ' is done')
        time.sleep(2)
        print(city[0] + ' is done')
Example #16
    def level_chosen(self, event):
        self.init_choice()
        self.subject_choice.Clear()
        if self.level_choice.GetSelection() == 0:
            return
        else:
            level = self.level_choice.GetStringSelection()  # Get level chosen

        # Cache
        cache_folder = Cache.customized_directory()
        cache_subject = os.path.join(cache_folder, "GCE Guide %s" % level)
        if not os.path.exists(cache_subject):
            self.subject_dict = Crawler.visit_level(
                Crawler.levels_dict[level])  # Return subject list
            if self.subject_dict == -1:  # Connection error
                wx.MessageBox(
                    "Please check your Internet connection and retry.",
                    "Connection Error")
                self.level_choice.SetSelection(0)
                return
            else:
                Cache.store(self.subject_dict, cache_subject)
        else:
            self.subject_dict = Cache.load(cache_subject)

        subject_list = ["----- Select subject -----"
                        ] + [each for each in self.subject_dict]
        self.subject_choice.Set(subject_list)  # Update subject list
        self.subject_choice.SetSelection(0)
Example #17
    def findSimilar(self, link, limit):

        # we call the read text function from the Crawler to read the new link
        # we use the constructor with empty arguments
        crawler = Crawler.Crawler('', 0, 0, 0)


        self.limit = limit
        file = open("Data/page%d.txt" % self.limit, 'w')

        try:
            self.title , text = crawler.getText(link)
            # we combine the lists of string to a single string
            text = ''.join(text)
            for t in text:
                file.write(t)
            file.close()
        except:
            print("Link is not accessible")
            file.close()
            sys.exit(0)

        indexer = Indexer.Indexer()
        indexer.start()

        cosineSimilarity = indexer.getCosineSimilarity()



        linksId = [ i for i in range(self.limit)]

        linksIdSorted = [x for _,x in sorted(zip(cosineSimilarity,linksId),reverse=True)]

        return cosineSimilarity , linksIdSorted
Example #18
def upload_file():
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # if user does not select file, browser also
        # submit an empty part without filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            redirect(url_for('upload_file', filename=filename))

            try:

                crawMobi = Crawler.crawler()
                crawMobi.menu()

                return render_template('download.html')

            except Exception as e:
                return render_template('error.html')

    return render_template('home.html')
Example #19
def getTitles(session, object, forumurl):
    error_data = list()
    data = list()
    zion = object
    s = session
    m = re.compile('&start=(\d+)').search(forumurl)
    num = 1
    if m:
        num = int(m.group(1))
        url = re.sub('&start=(\d+)', '', forumurl)
    else:
        url = forumurl
    for i in range(0, num, 25):
        tup = zion.staticGet(
            s, zion.stem + url.strip('.') + '&start={}'.format(i))

        s, soup = tup[0], tup[2]
        if i == 0:
            forums = soup.find_all("a", {"class": "forumtitle"})
            if forums != []:
                forumURLs = [forum.get('href') for forum in forums]
                forumtitles = [forum.text for forum in forums]
                tup = getLastPage(s, zion, forumtitles, forumURLs)
                for x in tup[1].values():
                    internal_data = getTitles(s, zion, x)
                    data.extend(internal_data)
        titles = soup.find_all("a", {"class": "topictitle"})
        authors = soup.find_all("a", {"class": "username"}) if soup.find_all(
            "a", {"class": "username"}) != [] else soup.find_all(
                "a", {"class": "username-coloured"})

        for tempTitle, tempAuthor in zip(titles, authors):
            article = OrderedDict()
            try:
                titleURL = tempTitle.get('href')
                title = tempTitle.text
                author = tempAuthor.text
                article['titleURL'] = titleURL
                article['title'] = title
                article['author'] = author
                data.append(article)
            except:
                error_data.append(tempTitle.get('href'))
    if data == []:
        cr.mktxt(forumurl + '\n', '/home/kyw/json_datas/zion',
                 'zion_noTitle.txt')
    return data
Example #20
 def __init__(self, busca, distancia=5):
     self.ponto_inicial = 'http://www.rugbyfluminense.com.br/'
     self.distancia = 2
     self.crawler = Crawler.Crawler()
     self.criar_no = Criar_No.Criar_No
     self.catalogo = []
     self.busca = busca  #### search term
     self.grafo = Grafo.Grafo()
Example #21
def GetPage(url):
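    # keep retrying with a fresh proxy until the crawled page looks valid (longer than 10 characters)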
    while True:
        p = GetProxy()
        page = Crawler.CrawlSinglePage(url, proxy=p)
        if len(page) > 10:
            break
        InvalidProxy(p)
    return page
Example #22
 def run(self):
     """
     Start processing.
     """
     # parse the command line arguments and set logging options
     try:
         self.args = self.parser.parse_args()
         self.configureLogging()
         self.logger.info("Started with {0}".format(' '.join(sys.argv[1:])))
     except Exception as e:
         self.parser.print_help()
         sys.exit(e)
     # load the configuration file
     try:
         with open(self.args.config) as f:
             self.config.readfp(f)
     except Exception as e:
         self.logger.critical("Could not load the specified configuration file")
         sys.exit(e)
     # set options
     Cfg.LOG_EXC_INFO = self.args.trace
     # execute commands
     with Timer.Timer() as t:
         if self.args.crawl:
             import Crawler
             Crawler.crawl(self.config, self.args.update)
         if self.args.clean:
             import Cleaner
             Cleaner.clean(self.config, self.args.update)
         if self.args.infer:
             import Facter
             Facter.infer(self.config, self.args.update)
         if self.args.graph:
             import Grapher
             Grapher.graph(self.config, self.args.update)
         if self.args.transform:
             import Transformer
             Transformer.transform(self.config)
         if self.args.post:
             import Poster
             Poster.post(self.config)
         if self.args.analyze:
             import Analyzer
             Analyzer.analyze(self.config, self.args.update)
     self.logger.info("Indexer finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds))
Example #23
    def __init__(self, __textUrlData, __pageRankData):
        # instantiates data / service classes
        self.__textData = __textUrlData
        self.__pageRankData = __pageRankData
        self.__webscraper = WebScrape(self.__textData, self.__pageRankData)
        self.__crawler = Crawler(self.__textData)

        # loads all data structures by calling lower tier helper classes
        self.__generate_data_structures()
Example #24
def init(username,password,sleepTime = 600,configList = None):
    fileOld = 'old'+username+'.txt'
    fileNew = 'new'+username+'.txt'
    if (configList):
        mailto_list = str(configList[3].strip('\n'))
        mail_host = str(configList[4].strip('\n'))
        mail_user = str(configList[5].strip('\n'))
        mail_pass = str(configList[6].strip('\n'))
        mail_postfix = str(configList[7].strip('\n'))
    else:
        mailto_list = raw_input("to email address, [email protected]:")
        mail_host = raw_input("smtp server,  smtp.qq.com:")
        mail_user = raw_input("mailUser,  [email protected]:")
        mail_pass = raw_input("mailPassword:"******"mailPostfix, qq.com:")
    while(True):
        try:
            Crawler.init(username,password)
        except Exception as e:
            print (e)
            continue
        new = open(fileNew, 'U').readlines()
        try:
            old = open(fileOld, 'r').readlines()
        except:
            old = open(fileOld, 'w+').readlines()
        diff = difflib.ndiff(old,new)

        if (not (filecmp.cmp(fileOld,fileNew))):
            print "not same"
            content = 'studentID= ' + username + ' \n'
            for i in diff:
                content += i
            print content
            mail = sendMail(content,mailto_list,mail_host,mail_user,mail_pass,mail_postfix)
            print "new grades"
            new = open(fileNew,'r')
            old = open(fileOld,'w')
            old.writelines(new.readlines())
        else:
            print "same"
        print "sleeping...@",
        print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        time.sleep(sleepTime)
Example #25
def get_link():
    with open('/home/kyw/agoraHTMLnumber/agoraHTMLnumber.txt', 'a') as wf:
        agora = 'http://c2djzrn6qx6kupkn.onion/'
        with requests.Session() as s:
            agora = cr.Site(agora)
            tup = agora.staticGet(s, agora.stem)
            s, html = tup[0], tup[1]
            links = re.compile('href="(\d+\.html)').findall(html.text)
            wf.write(links[-1].strip('.html') + '\n')
    return links[-1]
Example #26
def parse_home():
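    # crawl every URL in HOME_URL, pairing each with the matching entry in NUEVO_LINK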
    try:
        indice_link = 0
        for url in HOME_URL:
            instancia_crawler_main = Crawler.Crawler(url,
                                                     NUEVO_LINK[indice_link])
            instancia_crawler_main.crawler_main(indice_link)
            indice_link += 1
    except ValueError as ve:
        print(ve)
Example #27
class SearchEngine:

  def __init__(self):
    indexer      = Indexer()
    self.graph   = Graph()
    self.crawler = Crawler({"http://mysql12.f4.htw-berlin.de/crawl/d01.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d06.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d08.html"},
                            self.graph, indexer)
    self.crawler.crawl()
    self.scorer = Scorer(indexer.index, indexer.documents)

    self.pageRank = PageRank(self.graph)
    self.pageRank.calc()


  def search (self, string, scoreOnly = False):
    query  = string.split()
    scores = self.scorer.scoreQuery(query)

    if scoreOnly:
      results = scores
    else:
      results = {}
      for url, score in scores.items():
        results[url] = score * self.graph.get_document(url).rank

    sortedResults = sorted(results.items(), key=operator.itemgetter(1), reverse = True)
    for res in sortedResults:
      print(res)

  def printPageRanks(self):
    print('Page ranks:')
    print('  d01  -   d02  -   d03  -   d04  -   d05  -   d06  -   d07  -   d08')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d01.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d02.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d03.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d04.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d05.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d06.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d07.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d08.html").rank, 4), end = '\n\n')
Example #28
def to_url(request):
    s = request.GET.get('text')
    s = s.split(" ")
    r = []
    for i in s:
        if i != " " and i !="":
            r.append(i)
    a= Crawler.Crawler(0, 1, r, '/home/katrin/databasetemp/')
    try:
        a.downloadPages()
    except Exception, e:
        pass
Example #29
  def __init__(self):
    indexer      = Indexer()
    self.graph   = Graph()
    self.crawler = Crawler({"http://mysql12.f4.htw-berlin.de/crawl/d01.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d06.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d08.html"},
                            self.graph, indexer)
    self.crawler.crawl()
    self.scorer = Scorer(indexer.index, indexer.documents)

    self.pageRank = PageRank(self.graph)
    self.pageRank.calc()
Example #30
def fetch_url(url):
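    # returns a (url, response, error) tuple: error is None on success, response is None on failure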
    # print "-> %r requesting" % (url)
    url = url.rstrip('\n')
    try:
        response = Crawler.data_request(url)
        # print "-> %r request complete" % (url)
        if (response is not None) and (response not in error_list):
            return url, response, None
        else:
            return url, None, response
    except Exception as e:
        return url, None, e
Example #31
def test():
    c = Crawler("http://www.artsandscience.utoronto.ca/ofr/timetable/winter/csc.html")
    c.extract_course_code()
    c.course_init()
    file = FileSystem("course.txt", c.get_courses())
    file.writing()
    print "finished"
Example #32
def findword():
    #Check whether the parameters were sent
    if (request.args.get('urls', default=[], type=str)
            and request.args.get('termo', default="error", type=str)):
        #Take the string with the URLs and turn it into a list of URLs
        array = request.args.get('urls', default=[], type=str)
        #Handle basic input errors
        array = array.replace("[", "")
        array = array.replace("]", "")
        array = array.replace("'", "")
        array = array.replace('"', "")
        array = array.split(",")
        #check that each URL contains a dot and the http or https protocol
        for i in range(len(array)):
            #if there is no dot, it is an error
            if not (array[i].find('.') > -1):
                return root()
            #if the protocol is missing, prepend it to the string
            if not (array[i].find('http') > -1):
                array[i] = 'http://' + (array[i].strip())

    #Assign the search term
        word = request.args.get('termo', default="", type=str)
        #check that it is not empty
        if (not word.strip()):
            return root()
    #check and assign the optional ignorecache parameter
        ignorecache = request.args.get('ignorecache',
                                       default='False',
                                       type=str)
        if (ignorecache == 'False' or ignorecache == 'false'
                or ignorecache == '0' or ignorecache == ''):
            return jsonify(Crawler.spider(array, word, len(array), False))
        #Anything other than a missing parameter or "false" is passed as True
        return jsonify(Crawler.spider(array, word, len(array), True))
    #If the parameters are not provided, return the page with the parameter reference
    return root()
    print("\n" * 5)
    print("TEMPO API ASYNC")
    print("--- %s seconds ---" % (time.time() - start_time))
Example #33
def crawl_urls(random_url_list, web_df, keyword_df, i, base_dict,
               save_after_itr, url_jump_lag, num_retries, wait_range):
    status_flag = ''
    random_urls_sub = random_url_list[i:i + save_after_itr]

    html_dict = venom.download_randomly(random_urls_sub,
                                        url_jump_lag=url_jump_lag,
                                        num_retries=num_retries,
                                        wait_range=wait_range)
    list_of_dicts = []

    for domain_url, domain_data in html_dict.items():
        # link_tag, link_class, text_tag, text_class = '', '', '', ''
        html = list(domain_data.values())[0]
        domain_name = list(domain_data.keys())[0]
        href = ''
        visible_text = ''
        status_flag = ''

        if html is not None:
            href, visible_text = get_data_from_html(web_df, html, domain_name,
                                                    base_dict)

        else:
            print('Web page not scraped:', domain_url)
            status_flag = 'Access Denied'

        list_of_dicts.append({
            "Domain Name": domain_name,
            "URL": domain_url,
            "Product Link": href,
            "Product Description": visible_text,
            'Status Flag': status_flag
        })

    crawled_df = pd.DataFrame(list_of_dicts)

    if crawled_df.empty or (crawled_df is None):
        # Create empty dataframe
        crawled_df = pd.DataFrame(columns=[
            'Domain Name', 'Product Description', 'Product Link', 'URL',
            'MATERIAL_NO', 'MANUFACTURER_NAME_1', 'MANUFACTURER_PT_NO_1',
            'NOUN_ENGLISH', 'MODIFIER_ENGLISH', 'Keywords', 'Status Flag'
        ])
    else:
        crawled_df['Product Description'].fillna('', inplace=True)
        crawled_df['Product Link'].fillna('', inplace=True)

        crawled_df = crawled_df.merge(keyword_df, on='URL', how='left')
        crawled_df.drop('Search_URL', axis=1, inplace=True)

    return crawled_df
Example #34
    def ProcessTask(self, task):
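        # build the crawler parameters from the task, parse the board articles, and dispatch low-price tasks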
        crawlerParams = {
            'board': task['board'],
            'blFromJson': True,
            'start': -(task['searchPages']),
            'end': -1
        }

        crawler = Crawler.PttWebCrawler(crawlerParams)
        articles = crawler.parse_articles()

        if self.LOW_PRICE_TASK == task['type']:
            self.HandleLowPriceTask(articles, task)
Example #35
def doSongciCrawler():
    content = urllib2.urlopen(Crawler.songurl).read()
    #print content
    parser = Crawler.MyParser()
    parser.feed(content)
    print len(parser.linkList)
    print len(parser.linkDescList)

    linkDescList = parser.linkDescList
    linkList = parser.linkList
    #ind = 0
    savedCnt = 0
    failedCnt = 0
    #len(linkList)
    for ind in range(len(linkList)):
        item = linkList[ind]
        print Crawler.base_url + item
        guwenPage = urllib2.urlopen(Crawler.base_url + item).read()
        newParser = Crawler.MyParser()
        newParser.feed(guwenPage)
        guwenContNow = newParser.songciCont

        res = Crawler.songciContHandle(guwenContNow)
        for item in res:
            if len(item) > 0:
                try:
                    songci = Songci()
                    songci.set('allStr', ' '.join(item))
                    songci.set('category', linkDescList[ind].strip())
                    songci.set('name', item[0].strip())
                    songci.set('author', item[1].strip())
                    songci.set('content', item[2].strip())
                    songci.save()
                    savedCnt += 1
                except Exception, e:
                    failedCnt += 1
                    #if str(e).find('LeanCloudError: [137] A unique field was given a value that is already taken.') is -1:
                    print e
                    print ' '.join(item), 'saving failed'
Example #36
def getText(url):
    # fetch the URL and extract the news article text
    html = Crawler.askUrl(url)
    if html is None:
        return None
    soup = BeautifulSoup(html, "html.parser")
    contents = soup.find_all('span', {'class': 'bjh-p'})
    if len(contents) == 0:
        contents = soup.find_all('p')
    text = ''
    for i in range(0, len(contents)):
        text += contents[i].text
    return text
Example #37
def main():
	print("Starting our Web Crawler")
	baseUrl = input("Website > ")
	numberOfThreads = input("No Threads > ")

	linksToCrawl = queue.Queue()
	urlLock = threading.Lock()
	linksToCrawl.put(baseUrl)
	haveVisited = []
	crawlers = []
	errorLinks = []

	with open("links.txt", "w+") as f:
		for i in range(int(numberOfThreads)):
			crawler = Crawler(baseUrl, linksToCrawl, haveVisited, errorLinks, urlLock, f)
			crawler.start()  # start the crawler thread; run() would block and join() below would fail
			crawlers.append(crawler)
		
		for crawler in crawlers:
			crawler.join()

	print("Total Number of Pages Visited {}".format(len(haveVisited)))
	print("Total Number of Pages with Errors {}".format(len(errorLinks)))
Example #38
def agoraMultiCrawler(new_html):

    with requests.Session() as s:

        return_data = OrderedDict()
        agora = 'http://c2djzrn6qx6kupkn.onion/'
        agora = cr.Site(agora)

        # if i == 0:
        #     tup = agora.staticGet(s, agora.stem)
        #     s, html, soup = tup[0], tup[1].text, tup[2]
        tup = agora.staticGet(s, agora.stem + "/{}.html".format(new_html))
        s, html, soup = tup[0], tup[1].text, tup[2]
        messages = soup.find_all("div", {"class": "message"})
        labels = soup.find_all("label")

        ids = soup.find_all("span", {"class": "reflink"})

        content_data = list()

        for id, label, message in zip(ids, labels, messages):
            temp_data = OrderedDict()
            posterman = label.find("span", {
                "class": "postername"
            }).get_text().encode(
                'iso-8859-1').decode('utf-8').strip('\n') if label.find(
                    "span", {"class": "postername"}) is not None else None
            filetitle = label.find("span", {
                "class": "filetitle"
            }).get_text().encode(
                'iso-8859-1').decode('utf-8').strip('\n') if label.find(
                    "span", {"class": "filetitle"}) is not None else None
            for lab in label("span"):
                lab.decompose()
            mid = id.find_all('a')[-1].get_text()
            date = label.get_text().encode('iso-8859-1').decode('utf-8').strip(
                '\n').strip('  ')
            ms = message.get_text().encode('iso-8859-1').decode('utf-8').strip(
                '\n')
            temp_data['author'] = posterman
            temp_data['title'] = filetitle
            temp_data['id'] = mid
            temp_data['date'] = date
            temp_data['message'] = ms
            content_data.append(temp_data)
        return_data['html'] = html
        return_data['content'] = content_data
        return_data['url'] = tup[1].url
        return return_data
Example #39
    parser.set_defaults(map_save_to_dot=MAP_SAVE_TO_DOT)

    args = parser.parse_args()

    print("Running configuration:")
    for attr, value in sorted(args.__dict__.iteritems()):
        print("\t{0} = {1}".format(attr, value))

    c = Crawler(args.language,
                args.currency,
                args.user_place,
                args.departure_point,
                args.ignored_points,
                args.departure_month,
                args.departure_year,
                args.price_limit,
                args.flights_limit,
                args.selenium_host,
                args.selenium_port,
                args.selenium_start_cmd,
                args.selenium_load_timeout,
                args.map_save_to_dot,
                args.map_dot_filename,
                args.output_encoding)

    try:
        c.create_map()
        c.analyze_map()
    except KeyboardInterrupt:
        print("Ctrl-C pressed...")
    except SeleniumError as err:
        print(str(err))
Example #40
def handleRule(rule):
	ic = ItemCrawler()
	db_avg_cost = DB.get_avg_price(rule.link)
	
	Log("{} for less then {} Chaos".format(rule.name, (PRICE_MULTIPLICATOR*db_avg_cost)),"CURRENT SCAN", "blue", 0)
	
	hits = ic.hits(rule.link)
	
	if not rule.enabled:
		return
	
	isFirst = True
	hitCnt = 0
	hitSum = 0
	for hit in hits:
		
		cost = Crawler.getCostFromEntry(hit)
		ign  = Crawler.getIGNFromEntry(hit)
		(corrupted, item_name) = Crawler.getItemNameFromEntry(hit)
		
		item_print = item_name 
		
		if not item_name:
			item_name = "<NOT FOUND>"
		else:
			if corrupted:
				item_print = Fore.RED + "Corrupted" + Fore.RESET + " " + item_name
				
		chaos_cost = Currency.getChaosValue(cost)
		
		# As long as no item has been found, db_avg_cost is 0, and so is db_avg_cost * PRICE_IGNORE_MIN.
		# The same applies to PRICE_IGNORE_MAX.
		if not db_avg_cost:
			db_avg_cost = chaos_cost
			
		if chaos_cost < db_avg_cost * PRICE_IGNORE_MIN or chaos_cost > db_avg_cost * PRICE_IGNORE_MAX:
			Log("Someone is selling {} ({}) for {} chaos!!".format(item_print, rule.link, chaos_cost), 
				"TROLL ALERT", "red", 0)
			continue
		
		if(hitCnt < MAX_HITS_FOR_AVG):
			hitSum += chaos_cost
			hitCnt += 1
			
		
		# if chaos_cost <= rule.price and isFirst:
		if chaos_cost <= (PRICE_MULTIPLICATOR*db_avg_cost):
			Log("Schnappaaah : ", "INFO", "yellow", 0)
			Log(rule.name, "RULE", None, 1)
			Log(item_print, "ITEM", None, 1)
			Log(rule.link, "LINK", None, 1)
			Log(cost, "COST", None, 1)
			Log(chaos_cost, "CHAOS COST", None, 1)
			Log("@{} Hi, I would like to buy your {} listed for {} in {}\n".format(ign, item_name, cost, LEAGUE), "WHISPER", None, 1)
			
			if rule.alert:
				os.system("start C:\\Users\\Ramon\\workspace\\git\\rockwurst\\testsound.mp3")

			isFirst = False
	if hitCnt > 0:  # guard against a zero division when every hit was filtered out
		hitAvg = hitSum / hitCnt
		Log("took the first {} items with an average of {} chaos".format(hitCnt, hitAvg), "SCAN RESULT", "white", 0)
		
		DB.insert_scan(rule.link, hitAvg)
Example #41
from scipy.io import loadmat, savemat
from scipy.misc import imread, imsave
import cPickle
import subprocess
import re
import glob

from Crawler import *
from ModalDB import *
from Settings import Settings

#Crawl videos
if True:
  c = Crawler()
  q='how+to+hard+boil+an+egg'
  VIDS = c.searchYoutube(q.decode('utf-8'),5)
  print VIDS
  print 'Download Them'
  c.downloadVideos()
  c.getSubtitles()
  with open('ids.bn', 'wb') as fp:
    cPickle.dump(VIDS, fp)

#VIDS = cPickle.load(open('ids.bn','rb'))

import os
modaldb_client = ModalClient(root=Settings.data_dir, schema=Settings.my_schema) # here, we specify the DB's location (on disk) and schema.
#modaldb_client.clear_db() # empty the database just in case.

#Pushing videos
for vid_id in VIDS: