def checker():
    """Re-fetch every song of the collection and bring the index up to date.

    For each song returned by ``load_collection()`` the page at ``song.url``
    is requested again:

    * if the response status is not 200, the song is deleted from the
      collection and all of its tokens are removed from the index;
    * otherwise the page is parsed into a fresh ``Song``; when it differs
      from the stored one, only the token difference is applied to the
      index (vanished tokens are removed, new tokens are added).

    A ``CHECKED: <filename>`` line is printed for every processed song.
    """
    collection = load_collection()
    for song in collection:
        resp = r.get(song.url)
        if resp.status_code != 200:
            # The page is no longer available: drop the song and stop here.
            # Without the `continue` the error page would be parsed below
            # and the song that was just removed would be re-indexed.
            delete_from_collection(song.filename)
            remove_from_index(preprocess(song.lyrics, lemma=False), song.filename)
            print(f'CHECKED: {song.filename}')
            continue

        soup = BS(resp.text, features='html.parser')
        # Get artist, title and text from the doc
        # The slice drops the last 13 characters of the artist text.
        # NOTE(review): presumably a fixed suffix on the page - confirm.
        artist = soup.find(class_='lyric-artist').text[:-13]
        title = soup.find(class_='lyric-title').text
        lyrics = soup.find(class_='lyric-body').text
        song2 = Song(artist, title, lyrics, song.url)

        if song != song2:
            # Apply only the difference between the stored and the fetched
            # version instead of re-indexing the whole song.
            # NOTE(review): relies on preprocess() returning sets (uses `-`);
            # the stored collection entry itself is not rewritten here -
            # confirm that this is handled elsewhere.
            tokens_1 = preprocess(song.lyrics, lemma=False)
            tokens_2 = preprocess(song2.lyrics, lemma=False)
            # Tokens that are no longer present in the updated song
            to_remove = tokens_1 - tokens_2
            remove_from_index(to_remove, song.filename)
            # Tokens that are new in the updated song
            to_update = tokens_2 - tokens_1
            update_index(to_update, song.filename)
        print(f'CHECKED: {song.filename}')
def parse_loop(): logfile = config.logdir + "/aprs.log" os.system("mkdir -p " + config.logdir) os.system("touch " + logfile) #start aprs_decoder os.system("./aprs_decoder >%s&" % logfile) try: #start web updaters thread.start_new_thread(uploader_thread, ()) thread.start_new_thread(downloader_thread, ()) wait_start = True where = 0 while 1: file = open(logfile,'r') file.seek(where) d = file.readline() where = file.tell() file.close() if not d: time.sleep(0.1) else: if wait_start: #Check for correct config.sender address if d.startswith("AFSK1200: fm %s" % config.sender.upper()): wait_start = False else: wait_start = True if d[1:7].isdigit() and d[7] == 'h': name = d[1:7] elif config.log_all_messages: now = datetime.utcnow() name = "%02d%02d%02d" % (now.hour, now.minute, now.second) else: print "Unhandled message:", d.strip() continue msg_name = config.logdir + "/" + name found = False for msg in glob.glob(config.logdir + "/" + "[0-9]" * 6 + "*"): if msg.startswith(msg_name): print "Message", msg, "already present" found = True break if not found: utils.write_file(config.logdir + "/" + name, d) utils.write_file(config.logdir + "/" + name + ".unsent", d) utils.update_index(config.logdir) except KeyboardInterrupt: print "\nCTRL-C pressed, exit" finally: os.system("killall aprs_decoder")
#!/usr/bin/env python import config import utils def web_index(): return utils.http_get(config.msg_index_url).split() def parse_index(index): s = set() for f in index: s.add(f.strip()) return s if __name__ == "__main__": try: l = open(config.logdir + "/" + config.msg_index) local = parse_index(l) except IOError: local = set() web = parse_index(web_index()) diff_web = web - local for d in reversed(sorted(diff_web)): print "Getting", d msg = utils.http_get(config.base_url + d) utils.write_file(config.logdir + "/" + d, msg) utils.update_index(config.logdir)