def _import_db_backup(self):
    """Import offers from the SQLite backup database into GeorezoRSS.

    Reads every row of the ``elpaso_annonce`` table in
    ``elpaso_new.sqlite`` and saves each offer as a GeorezoRSS record,
    skipping rows whose ID is already stored.

    Row layout assumed (from the column accesses below):
    ``(id_rss, title, content, date_pub)`` — TODO confirm against the
    backup DB schema.
    """
    # connect to the input database: all offers consolidated
    conn = sqlite3.connect('elpaso_new.sqlite')
    try:
        db_cursor = conn.cursor()
        db_cursor.execute("SELECT * FROM elpaso_annonce")
        input_rows = db_cursor.fetchall()
    finally:
        # close the input DB connection even if the query fails
        conn.close()
    print(len(input_rows))

    for input_row in input_rows:
        print(input_row[0], type(input_row), len(input_row))
        # skip offers already imported
        if GeorezoRSS.objects.filter(id_rss=input_row[0]).exists():
            logging.error("Offer ID already exists: {}".format(input_row[0]))
            continue
        # parse the RFC-822 style publication date stored in the backup
        publication_date = arrow.get(input_row[3], "ddd, D MMM YYYY HH:mm:ss Z")
        try:
            offer = GeorezoRSS(
                id_rss=input_row[0],
                title=input_row[1],
                content=input_row[2],
                date_pub=publication_date.format(),
                source=1,
            )
            offer.save()
            logging.info("New offer added: {}".format(input_row[0]))
        except IntegrityError:
            # duplicate slipped past the pre-check (e.g. concurrent insert)
            logging.error("Offer ID already exists: {}".format(input_row[0]))
            continue
        except Exception as e:
            # best-effort import: log the failing row and keep going
            logging.error(e)
def _add_new_offers(self):
    """Add new offers from the GeoRezo RSS feed.

    Reads the last-processed offer ID from ``last_id_georezo.txt``,
    parses the feed, stores the newest feed ID back to that file, and
    saves a GeorezoRSS record for every entry whose ID is greater than
    the stored one. When at least one offer was added, launches the
    analysis step (``Analizer``) on the list of new IDs.
    """
    # get the ID of the last offer parsed during the previous run
    with open(path.abspath(r"last_id_georezo.txt"), "r") as fichier:
        last_id = int(fichier.readline())
    logger.info("Previous offer ID: {}".format(last_id))

    li_id = []    # IDs of the offers added during this run
    compteur = 0  # counter of new offers

    # RSS parser
    feed = feedparser.parse("http://georezo.net/extern.php?fid=10")
    logger.info("Parser created")

    # looping on feed entries
    for entry in feed.entries:
        # get the ID cleaning 'link' markup
        try:
            job_id = int(entry.id.split("#")[1].lstrip("p"))
        except AttributeError as e:
            # BUG FIX: original read "except AttributeError(e):", which
            # evaluates AttributeError(e) with e unbound and raises
            # NameError instead of handling the exception.
            logger.error("Feed index corrupted: {} - ({})".format(feed.entries.index(entry), e))
            continue

        # first offer parsed is the last published, so the biggest ID:
        # persist it so the next run knows where to stop
        if feed.entries.index(entry) == 0:
            with open(path.abspath(r"last_id_georezo.txt"), "w") as fichier:
                fichier.write(str(job_id))

        # formating publication date
        publication_date = arrow.get(entry.published, "ddd, D MMM YYYY HH:mm:ss Z")

        # an ID greater than the stored one means the offer is more
        # recent and has not been processed yet
        if job_id > last_id:
            try:
                offer = GeorezoRSS(
                    id_rss=job_id,
                    title=entry.title,
                    content=entry.summary,
                    date_pub=publication_date.format(),
                    source=True,
                )
                offer.save()
                # incrementing counter
                compteur += 1
                # adding the offer's ID to the list of new offers to process
                li_id.append(job_id)
                logger.info("New offer added: {}".format(job_id))
            except IntegrityError:
                # in case of duplicated offer
                logger.error("Offer ID already exists: {}".format(job_id))
                continue
            except Exception as e:
                # BUG FIX: original read "except Exception(e):" — same
                # unbound-name defect as above; now a proper catch-all log.
                logger.error(e)
        else:
            logger.info("Offer ID inferior to the last registered: {}".format(job_id))

    # after loop
    logger.info("{} offers have been added.".format(compteur))

    # if new offers => launch next processes
    if compteur > 0:
        logger.info("New offers IDs: " + str(li_id))
        # analyzing offers
        Analizer(li_id)
        # # fillfulling the DB
        # models.Fillin(li_id)