def main():
    arg_parser = argparse.ArgumentParser(
      description='Mobile App Crawler Manual',
      formatter_class=RawTextHelpFormatter)
    arg_parser.add_argument('-m', '--method',
      help=('crawl_new: Scrape Play Store top 300 app information '
            'for each category\n'
            'crawl_old: Update collected app information\n'
            'update_apk: Download APK file'))
    arg_parser.add_argument('-d', '--desktop',
      type=str2bool, default=True,
      help=('True (default): show the web browser (uses Selenium)\n'
            'False: do not show the web browser (uses a virtual screen)'))

    args = arg_parser.parse_args()

    if args.desktop is not None:
        desktop = args.desktop

    if args.method is not None:
        method = args.method

    playstore_crawler = Crawler(is_desktop=desktop)

    if method == "crawl_new":
        playstore_crawler.crawl_new()
    elif method == "crawl_old":
        playstore_crawler.crawl_old()
    elif method == "update_apk":
        playstore_crawler.update_apk()

    playstore_crawler.close()
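
Both argparse-based examples in this listing pass type=str2bool, a converter that the snippets do not define. A minimal sketch of such a helper, assuming the usual textual spellings of a boolean (the function body is an assumption, not code from the original projects):

import argparse

def str2bool(value):
    # Map common textual spellings of a boolean flag to a bool.
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')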
Example #2
    def get(self):
        cr = Crawler()

        url = "http://loaq.kr/"
        json_file = cr.start(url)

        return json_file
Example #3
def main():
    arg_parser = argparse.ArgumentParser(description='How to run the APK crawler')
    arg_parser.add_argument('--method',
                            help='Feature to run (crawl_new, crawl_old, update_apk)')
    arg_parser.add_argument('--desktop', type=str2bool, default=True,
                            help='true to run on the desktop, false to run on a server')

    args = arg_parser.parse_args()

    if args.desktop is not None:
        desktop = args.desktop

    if args.method is not None:
        method = args.method

    playstore_crawler = Crawler(is_desktop=desktop)

    if method == "crawl_new":
        playstore_crawler.crawl_new()
    elif method == "crawl_old":
        playstore_crawler.crawl_old()
    elif method == "update_apk":
        playstore_crawler.update_apk()

    playstore_crawler.close()
Example #4
def getPageMorts(url, pageFilter):
    crawl = Crawler(url, pageFilter=pageFilter)
    urlsMort = []
    for page in crawl:
        print("HttpCode:%d     Url: %s " % (page.codeHTTP, page.url))

        if page.codeHTTP not in range(200, 300):
            urlsMort.append((page.codeHTTP, page.url))

    # a dictionary that stores the results {parent url holding dead links: {HTTP code: [dead links]}}
    pageParents = {}
    for url in urlsMort:
        for pageParent in crawl.pagesToCrawl_dict[url[1]]:
            if pageParent in pageParents:
                if url[0] in pageParents[pageParent]:
                    pageParents[pageParent][url[0]].append(url[1])
                else:
                    pageParents[pageParent][url[0]] = [url[1]]
            else:
                pageParents[pageParent] = {url[0]: [url[1]]}

    print "\n Crawl complete!\n"
    with open('liensMort.txt', 'w') as dump_file:
        for pageParent in pageParents:
            dump_file.write('In page:\n{}\n'.format(pageParent))
            dump_file.write('\n')
            for code in sorted(pageParents[pageParent]):
                dump_file.write('HTTP return code {}\n'.format(code))
                for url in pageParents[pageParent][code]:
                    dump_file.write('        {}\n'.format(url))
            dump_file.write('*' * 80 + '\n\n')
Example #5
    def findSimilar(self, link, limit):

        # we call the read-text function from the Crawler to read the new link;
        # we use the constructor with empty variables
        crawler = Crawler.Crawler('', 0, 0, 0)

        self.limit = limit
        file = open("Data/page%d.txt" % self.limit, 'w')

        try:
            self.title, text = crawler.getText(link)
            # we combine the list of strings into a single string
            text = ''.join(text)
            for t in text:
                file.write(t)
            file.close()
        except Exception:
            print("Link is not accessible")
            file.close()
            sys.exit(0)

        indexer = Indexer.Indexer()
        indexer.start()

        cosineSimilarity = indexer.getCosineSimilarity()



        linksId = [i for i in range(self.limit)]

        linksIdSorted = [x for _, x in sorted(zip(cosineSimilarity, linksId), reverse=True)]

        return cosineSimilarity, linksIdSorted
Example #6
def main():
    stocknum = str(600000)
    total = dict()
    for i in range(1, 10):
        page = str(i)
        crawler = Crawler(stocknum, page)
        datalist = crawler.getData()
        comments = File(stocknum + '_page_' + page, 'json', './data/')
        comments.inputData(datalist)
        data = open('./data/' + stocknum + '_page_' + page + '.json',
                    'r').read()
        jsonData = json.loads(data)
        for detail in jsonData:
            num = '1' if '年' not in detail['age'].encode(
                'utf-8') else detail['age'].encode('utf-8').replace('年', '')
            num = float(num)
            date = detail['time'][4:14].encode('utf-8')
            total[date] = total[date] if date in total.keys() else {
                'num': 0,
                'content': 0
            }
            total[date]['num'] = total[date]['num'] + num if total[date][
                'num'] else num
            total[date]['content'] = total[date][
                'content'] + detail['content'] * num if total[date][
                    'content'] else detail['content'] * num
    total = json.dumps(total)
    totalfile = File(stocknum, 'json', './data/')
    totalfile.inputData(total)
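
The File helper used above is not part of the listing. Judging only from how it is called (File(name, 'json', './data/') followed by inputData with either a crawled list or an already-serialized JSON string), a minimal sketch might look like this; the class body is an assumption, not the original implementation:

import json
import os

class File:
    def __init__(self, name, ext, folder):
        # Build the target path, e.g. ./data/600000_page_1.json
        self.path = os.path.join(folder, name + '.' + ext)

    def inputData(self, data):
        # Write the payload; JSON-encode anything that is not already a string.
        with open(self.path, 'w') as fp:
            fp.write(data if isinstance(data, str) else json.dumps(data))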
Example #7
def storeUrl4Comp():
    db = pymysql.connect("localhost",
                         "root",
                         "root",
                         "kg4company_2",
                         charset="utf8")
    cur = db.cursor()
    sql1 = u"select name from city;"
    reCount = cur.execute(sql1)  # returns the number of affected rows
    cities1 = cur.fetchall()
    sql2 = u"select name from industry;"
    cur.execute(sql2)
    cities2 = cur.fetchall()
    cities = cities1 + cities2
    sql3 = u"insert into urlList(url4comp) values('{name}');"
    cities = cities[71:]
    for city in cities:
        print city[0]
        for idx in range(1, 31):
            print('page: ' + str(idx))
            cwler = cw.Crawler()
            cwler.getCompUrl(idx, city[0])
            list_ = cwler.listComp
            if len(list_) == 0:
                break
            for li in list_:
                print li
                sqlN = sql3.format(name=li)
                cur.execute(sqlN)
                db.commit()
            time.sleep(2)
            print('page: ' + str(idx) + ' is done')
        time.sleep(2)
        print(city[0] + ' is done')
Example #8
 def __init__(self, busca, distancia=5):
     self.ponto_inicial = 'http://www.rugbyfluminense.com.br/'
     self.distancia = 2
     self.crawler = Crawler.Crawler()
     self.criar_no = Criar_No.Criar_No
     self.catalogo = []
     self.busca = busca  # search term
     self.grafo = Grafo.Grafo()
Example #9
    def __init__(self, __textUrlData, __pageRankData):
        # instantiates data / service classes
        self.__textData = __textUrlData
        self.__pageRankData = __pageRankData
        self.__webscraper = WebScrape(self.__textData, self.__pageRankData)
        self.__crawler = Crawler(self.__textData)

        # loads all data structures by calling lower tier helper classes
        self.__generate_data_structures()
Example #10
def parse_home():
    try:
        indice_link = 0
        for url in HOME_URL:
            instancia_crawler_main = Crawler.Crawler(url,
                                                     NUEVO_LINK[indice_link])
            instancia_crawler_main.crawler_main(indice_link)
            indice_link += 1
    except ValueError as ve:
        print(ve)
Example #11
def to_url(request):
    s = request.GET.get('text')
    s = s.split(" ")
    r = []
    for i in s:
        if i != " " and i != "":
            r.append(i)
    a = Crawler.Crawler(0, 1, r, '/home/katrin/databasetemp/')
    try:
        a.downloadPages()
    except Exception:
        pass
Example #12
def main():
    # starting url
    url = "https://en.wikipedia.org/wiki/London"
    # limit of pages
    limit = 10
    # bool variable to keep or delete previous data read
    delete = True
    # number of threads
    threadsNumber = 3



    t0 = time.time()
    # by calling the Crawler the threads start collecting data from the links
    crawler = Crawler.Crawler(url, limit, delete, threadsNumber, printBool=False)
    crawler.start()
    print("Read ", limit, " pages, using ", threadsNumber, " threads in ", time.time() - t0, " seconds.")

    queryProcessor = QueryProcessor()

    link = "https://en.wikipedia.org/wiki/Paris"

    # the k most similar links
    k = 4


    similarities, links = queryProcessor.findSimilar(link, limit)
    title = queryProcessor.getTitle()

    titles = crawler.getTitles()
    urls = crawler.getLinks()

    print()
    print(k, " pages most similar to ", title)
    print()
    for i in range(k):
        x = links[i]
        print("With ", similarities[x], " similarity is ", titles[x], " link: ", urls[x])

    print()
    print("Completed in ", time.time() - t0, " seconds")

    # we delete the last page of the Data file,
    # this page is the page we just compared, by removing it
    # we can re-run the program with another page
    path, dirs, files = next(os.walk("Data"))
    last = len(files) - 1

    os.remove("Data/page%i.txt" % last)
Example #13
 def subjectActivated(self, text):
     self.subject = text
     if self.subject == "Mathematics":
         self.crawler = Crawler.Crawler("math")
     elif self.subject == "Physics":
         self.crawler = Crawler.Crawler("physics")
     elif self.subject == "Computer Science":
         self.crawler = Crawler.Crawler("cs")
     elif self.subject == "Quantitative Biology":
         self.crawler = Crawler.Crawler("q-bio")
     elif self.subject == "Quantitative Finance":
         self.crawler = Crawler.Crawler("q-fin")
     elif self.subject == "Statistics":
         self.crawler = Crawler.Crawler("stat")
Example #14
    def __init__(self, config):
        print(color + 'Env created')
        self.config = config
        self.default_action = config['Env_config']['default_action']

        [self.angle_vect_size, self.pressure_vect_size,
         self.reward_vect_size] = self.config['Sim_config']['obs_vector_space']
        print(color, 'obs vect space', self.angle_vect_size,
              self.pressure_vect_size, self.reward_vect_size)
        self.vect_size = self.angle_vect_size + self.pressure_vect_size + self.reward_vect_size
        self.action_space_size = self.angle_vect_size

        self.data = ""
        self.crawler = Crawler.Crawler(config=self.config)

        self.cycle_id = 0
        self.cycle_reset_period = self.config['Env_config'][
            'cycle_reset_period']

        self.temp_agent_reward_que = send_que = multiprocessing.Queue()
        self.reward_delay_length = 10
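
The __init__ above reads only a handful of keys from its config argument. A minimal dictionary that would satisfy those lookups, with placeholder values that are assumptions rather than values from any real configuration:

config = {
    'Env_config': {
        'default_action': 0,
        'cycle_reset_period': 100,
    },
    'Sim_config': {
        # [angle_vect_size, pressure_vect_size, reward_vect_size]
        'obs_vector_space': [8, 4, 1],
    },
}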
Example #15
def CrawlBegin(paras, threadname, taskque, crawl_function):
    download_count = 0
    restart_count = 0

    # If an error is caught (status = 0), recreate the Crawler instance, print the error,
    # and keep looping until the task-finished signal (status = 1) is received, then stop.
    while True:
        try:
            print threadname, "successfully started"
            c = Crawler.Crawler(threadname, paras)
            status, download_count_iter = c.Crawling(threadname, taskque,
                                                     crawl_function)
            download_count += download_count_iter
            if status == 1:
                print threadname, "successfully finished", time.ctime()
                break
            restart_count += 1
        except Exception, e:
            print "(Spider)", e
            traceback.print_exc()
            continue
Example #16
def spider_thread(city):
	iii=0
	for idx in range(1, 30):
		cwler = cw.Crawler()
		cwler.getCompUrl(idx,city)
		list_=cwler.listComp
		print list_
		for li in list_:
			iii=iii+1
			print city+' company:'+str(iii)
			ci = connectWeb.compInfo()
			pri = connectWeb.projectInfo()
			pei = connectWeb.peopleInfo()
			ca= connectWeb.compAptitude()
			str4comp=ci.getInfo(li)
			pri.getInfo(li,str4comp)
			pei.getInfo(li,str4comp)
			ca.getInfo(li,str4comp)
			time.sleep(3)
			print city+' This company is done, take a rest'
		time.sleep(5)
		print city+' This page is done, take a rest'
Example #17
def main():
	print("Starting our Web Crawler")
	baseUrl = input("Website > ")
	numberOfThreads = input("No Threads > ")

	linksToCrawl = queue.Queue()
	urlLock = threading.Lock()
	linksToCrawl.put(baseUrl)
	haveVisited = []
	crawlers = []
	errorLinks = []

	with open("links.txt", "w+") as f:
		for i in range(int(numberOfThreads)):
			crawler = Crawler(baseUrl, linksToCrawl, haveVisited, errorLinks, urlLock, f)
			crawler.start()  # assuming Crawler subclasses threading.Thread; run() would execute synchronously and join() would then fail
			crawlers.append(crawler)
		
		for crawler in crawlers:
			crawler.join()

	print("Total Number of Pages Visited {}".format(len(haveVisited)))
	print("Total Number of Pages with Errors {}".format(len(errorLinks)))
Example #18
def check0x00(web):

    print(R + '\n   ====================================================')
    print(R + '    C R O S S   S I T E   R E Q U E S T   F O R G E R Y')
    print(R + '   ====================================================')
    time.sleep(0.7)
    print(O + ' [This module only has full support for start-page domains]')
    print(O + '   [Hence, it may not work satisfactorily for all domains]\n')

    if 'http' not in web:
        web = 'http://' + web

    # Just to make sure BeautifulSoup is working properly :)
    form1 = """<form action="/drupal/?q=node&amp;destination=node"  accept-charset="UTF-8" method="post" id="user-login-form">
    <div><div class="form-item" id="edit-name-wrapper">
     <label for="edit-name">Username: <span class="form-required" title="This field is required.">*</span></label>
     <input type="text" maxlength="60" name="name" id="edit-name" size="15" value="test1" class="form-text required" />
    </div>
    <div class="form-item" id="edit-pass-wrapper">
     <label for="edit-pass">Password: <span class="form-required" title="This field is required.">*</span></label>
     <input type="password" value="a9z8e7" name="pass" id="edit-pass"  maxlength="60"  size="15"  class="form-text required" />
    </div>
    <input type="submit" name="op" id="edit-submit" value="Log in"  class="form-submit" />
    <div class="item-list"><ul><li class="first"><a href="/drupal/?q=user/register" title="Create a new user account.">Create new account</a></li>
    <li class="last"><a href="/drupal/?q=user/password" title="Request new password via e-mail.">Request new password</a></li>
    </ul></div><input type="hidden" name="form_build_id" id="form-6a060c0861888b7321fab4f5ac6cb908" value="form-6a060c0861888b7321fab4f5ac6cb908"  />
    <input type="hidden" name="form_id" id="edit-user-login-block" value="user_login_block"  />
    </div></form> """

    form2 = """<form action="/drupal/?q=node&amp;destination=node"  accept-charset="UTF-8" method="post" id="user-login-form">
    <div><div class="form-item" id="edit-name-wrapper">
     <label for="edit-name">Username: <span class="form-required" title="This field is required.">*</span></label>
     <input type="text" maxlength="60" name="name" id="edit-name" size="15" value="test2" class="form-text required" />
    </div>
    <div class="form-item" id="edit-pass-wrapper">
     <label for="edit-pass">Password: <span class="form-required" title="This field is required.">*</span></label>
     <input type="password" value="a9z8e7" name="pass" id="edit-pass"  maxlength="60"  size="15"  class="form-text required" />
    </div>
    <input type="submit" name="op" id="edit-submit" value="Log in"  class="form-submit" />
    <div class="item-list"><ul><li class="first"><a href="/drupal/?q=user/register" title="Create a new user account.">Create new account</a></li>
    <li class="last"><a href="/drupal/?q=user/password" title="Request new password via e-mail.">Request new password</a></li>
    </ul></div><input type="hidden" name="form_build_id" id="form-6a060c0861888b7321fab4f5ac6cb908" value="form-6a060c0861888b7321fab4f5ac6cb908"  />
    <input type="hidden" name="form_id" id="edit-user-login-block" value="user_login_block"  />
    </div></form> """

    Cookie0 = cookielib.CookieJar()
    Cookie1 = cookielib.CookieJar()

    resp1 = urllib2.build_opener(urllib2.HTTPCookieProcessor(Cookie0))
    resp2 = urllib2.build_opener(urllib2.HTTPCookieProcessor(Cookie1))

    actionDone = []

    csrf = ''
    init1 = web
    form = Form()

    # Hope it works properly (no lxml error ;=;)
    bs1 = BeautifulSoup(form1).findAll('form', action=True)[0]
    bs2 = BeautifulSoup(form2).findAll('form', action=True)[0]

    action = init1

    resp1.open(action)
    resp2.open(action)

    crawler = Crawler(init1, resp1)
    print(GR + " [*] Initializing crawling...")

    global url
    try:

        while crawler.noinit():
            url = crawler.next()
            print(C + ' [+] Crawling :> ' + B + url)

            try:
                soup = crawler.process(web)
                if not soup:
                    continue

                i = 0
                print(O + ' [*] Retrieving all forms on ' + C + url + O +
                      '...')
                for m in getAllForms(soup):
                    action = uri.buildAction(url, m['action'])
                    if not action in actionDone and action != '':
                        try:
                            print()
                            result = form.prepareFormInputs(m)
                            r1 = request(url, action, result, resp1)
                            result = form.prepareFormInputs(m)
                            r2 = request(url, action, result, resp2)

                            if (len(csrf) > 0):
                                if not re.search(csrf, r2):
                                    print(
                                        G +
                                        '[+] Looks like we got a CSRF vulnerability on '
                                        + O + url + G + '!\n')
                                    try:
                                        if m['name']:
                                            print(R + '\n  =====')
                                            print(R + '   PoC')
                                            print(R + '  =====\n')
                                            print(B + ' [+] URL : ' + P + url)
                                            print(C + ' [+] Name : ' + O +
                                                  m['name'])
                                            print(G + ' [+] Action : ' + O +
                                                  m['action'])

                                    except KeyError:

                                        print(R + '\n  =====')
                                        print(R + '   PoC')
                                        print(R + '  =====\n')
                                        print(B + ' [+] URL : ' + P + url)
                                        print(G + ' [+] Action : ' + O +
                                              m['action'])

                                    print(O + ' [+] Code : ' + W +
                                          urllib.urlencode(result))
                                    print('')

                                continue

                            o2 = resp2.open(url).read()

                            try:
                                form2 = getAllForms(BeautifulSoup(o2))[i]

                            except IndexError:
                                print(R + ' [-] Form Error')
                                continue

                            contents2 = form.prepareFormInputs(form2)
                            r3 = request(url, action, contents2, resp2)

                            checkdiff = difflib.ndiff(r1.splitlines(1),
                                                      r2.splitlines(1))
                            checkdiff0 = difflib.ndiff(r1.splitlines(1),
                                                       r3.splitlines(1))

                            result12 = []
                            for n in checkdiff:
                                if re.match('\+|-', n):
                                    result12.append(n)
                            result13 = []
                            for n in checkdiff0:
                                if re.match('\+|-', n):
                                    result13.append(n)

                            if len(result12) <= len(result13):

                                try:
                                    if m['name']:

                                        print(R + '\n  =====')
                                        print(R + '   PoC')
                                        print(R + '  =====\n')
                                        print(B + ' [+] URL : ' + P + url)
                                        print(C + ' [+] Name : ' + O +
                                              m['name'])
                                        print(G + ' [+] Action : ' + W +
                                              m['action'])

                                except KeyError:

                                    print(R + '\n  =====')
                                    print(R + '   PoC')
                                    print(R + '  =====\n')
                                    print(B + ' [+] URL : ' + P + url)
                                    print(G + ' [+] Action : ' + W +
                                          m['action'])

                                print(O + ' [+] Code : ' + W +
                                      urllib.urlencode(result))
                                print('')

                        except urllib2.HTTPError as msg:
                            print(msg.__str__())
                            pass

                    actionDone.append(action)
                    i += 1

            except urllib2.URLError as e:
                print(R + ' [-] Exception at %s' % url)
                print(R + ' [-] Error : ' + str(e))
                continue

    except KeyboardInterrupt:
        print(R + "\n [-] Interrupted by user")
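
check0x00 relies on a getAllForms helper that is not included in the listing. Given how its result is indexed and how the sample forms at the top of the function are parsed, a one-line sketch (an assumption, not the original code) could be:

def getAllForms(soup):
    # Return every <form> element that declares an action attribute.
    return soup.findAll('form', action=True)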
Example #19

def processPaper(paperId):
    content = cr.crawlPaperMain(paperId)
    soup = BeautifulSoup(content)
    paper = pr.getPaperInfo(soup)
    paper.id = paperId
    outPut_sql(paper)


def processUser(userId):
    content = cr.crawlAuthorPub(userId)
    pr.parseAuthorPub(content)


cr = Crawler.Crawler()
pr = Parse.Parse()
sql = SQLConn.MysqlUti()

Init(pr, sql)
pr.testPaper('2488441')
paperId = sql.getPaper()

while (paperId is not None) or (sql.getUser() is not None):
    while paperId is not None:
        while not checkTime():
            print('sleeping... ')
            time.sleep(1.5 * 3600)

        print(paperId)
        try:
Example #20
                        '--output-encoding',
                        help='encoding for shell',
                        default=OUTPUT_ENCODING)

    parser.set_defaults(map_save_to_dot=MAP_SAVE_TO_DOT)

    args = parser.parse_args()

    print("Running configuration:")
    for attr, value in sorted(args.__dict__.iteritems()):
        print("\t{0} = {1}".format(attr, value))

    c = Crawler(args.language, args.currency, args.user_place,
                args.departure_point, args.ignored_points,
                args.departure_month, args.departure_year, args.price_limit,
                args.flights_limit, args.selenium_host, args.selenium_port,
                args.selenium_start_cmd, args.selenium_load_timeout,
                args.map_save_to_dot, args.map_dot_filename,
                args.output_encoding)

    try:
        c.create_map()
        c.analyze_map()
    except KeyboardInterrupt:
        print("Ctrl-C pressed...")
    except SeleniumError as err:
        print(str(err))
    except Exception as err:
        traceback.print_exc(file=sys.stdout)
    finally:
        c.cleanup()
Example #21
from Crawler import *

c = Crawler()
c.movie_info(8)
Example #22
def print_signature():
    print('═══════════════════════════════════════════════════════════════')
    print('███████╗██╗   ██╗██╗████████╗███████╗   ██╗      █████╗ ██████╗')
    print('██╔════╝██║   ██║██║╚══██╔══╝██╔════╝   ██║     ██╔══██╗██╔══██╗')
    print('███████╗██║   ██║██║   ██║   █████╗     ██║     ███████║██████╔╝')
    print('╚════██║██║   ██║██║   ██║   ██╔══╝     ██║     ██╔══██║██╔══██╗')
    print('███████║╚██████╔╝██║   ██║   ███████╗██╗███████╗██║  ██║██████╔╝')
    print('╚══════╝ ╚═════╝ ╚═╝   ╚═╝   ╚══════╝╚═╝╚══════╝╚═╝  ╚═╝╚═════╝')
    print('═══════════════════════════════════════════════════════════════')
    print('                      Blog Photo Crawler                        ')
    print('                    developed by woosik yoon [suitelab.github.io]')
    print('═══════════════════════════════════════════════════════════════')


if __name__ == "__main__":
    print_signature()
    cr = Crawler()

    while True:
        url = input('Enter a blog URL (type exit to quit): ')
        if url.upper() == 'EXIT':
            cr.driver.close()
            sys.exit(1)

        if not url_validate(url):
            print('Invalid URL.')
            continue

        cr.start(url)
        print(
            '═══════════════════════════════════════════════════════════════')
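
The loop above calls a url_validate helper that is not shown. A minimal sketch, assuming it only needs to accept well-formed http(s) URLs (this implementation is a guess, not the original):

from urllib.parse import urlparse

def url_validate(url):
    # Accept only http(s) URLs that include a host part.
    parts = urlparse(url)
    return parts.scheme in ('http', 'https') and bool(parts.netloc)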
Example #23
 def __init__(self, project_name, url):
     CrawlerUrl.project_name = project_name
     CrawlerUrl.queue_file = 'storage/' + CrawlerUrl.project_name + '/queue.txt'
     CrawlerUrl.crawled_file = 'storage/' + CrawlerUrl.project_name + '/crawler.txt'
     Crawler(CrawlerUrl.project_name, url, url)
Example #24
def main(base_url, max_count, flag):
    craw = crawler.Crawler(base_url)
    craw.crawling(base_url, max_count, flag)
Example #25
 def __init__(self, master=None):
     Frame.__init__(self, master)
     self.pack()
     self.createWidgets()
     self.c = C.Crawler()  #crawler module
Example #26
#!/usr/bin/python
from PyQt4 import QtCore, QtGui
import Crawler
import warnings
import time

# Global variables
warnings.filterwarnings("ignore")
cr = Crawler.Crawler("it")
message = ""

try:
    _fromUtf8 = QtCore.QString.fromUtf8
except AttributeError:

    def _fromUtf8(s):
        return s


try:
    _encoding = QtGui.QApplication.UnicodeUTF8

    def _translate(context, text, disambig):
        return QtGui.QApplication.translate(context, text, disambig, _encoding)
except AttributeError:

    def _translate(context, text, disambig):
        return QtGui.QApplication.translate(context, text, disambig)


class Ui_Form(QtGui.QWidget):
Example #27
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')

from Tkinter import *
import tkinter.messagebox

from Crawler import *

AGS = AGSpiel()
C = Crawler(AGS)

app = Tk()
app.title("AG-Spiel Crawler")
app.geometry("1000x600+100+100")


def AuslesenStarten():
    inArb.set("In progress")
    app.update()
    Eingabe = Eingabefeld.get().split(",")
    txtAusgabe.delete("1.0", END)
    for i in range(0, len(Eingabe)):
        Eingabe[i] = Eingabe[i].replace(" ", "")
        print Eingabe[i]
        C.AG_Hinzufuegen(AG(AGS, i, Eingabe[i]))
    for meineAG in C.get_AGListe():
Example #28
import Crawler as c
import pandas as pd
import os
from datetime import datetime
import time

crawler = c.Crawler()
crawler.crawl()

df = pd.DataFrame()
count = 1
print("Start merging..")
for i in os.listdir("./raw/"):
    print(str(count) + ". file: " + str(i))
    count += 1
    df = df.append(pd.read_csv("./raw/" + str(i),
                               sep=";",
                               encoding="utf-8",
                               decimal=","),
                   sort=False)

df = df.drop_duplicates(subset="URL")

succ = df.to_csv('./data/' +
                 str(datetime.now())[:19].replace(':', '').replace('.', '') +
                 '.csv',
                 sep=';',
                 decimal=',',
                 encoding='utf-8',
                 index_label='timestamp')
Example #29
def crawl():
    for i in range(len(urls)):
        crawler = Crawler.Crawler(urls[i], filesName[i])
        crawler.start()
Example #30
"""Pull data from geth and parse it into mongo."""

import Crawler

print("Booting processes.")
c = Crawler.Crawler()
print("Update complete.")