class EmailToOrders:
    """Maps an e-mail UID to the list of ``Order`` objects parsed from it.

    The mapping is loaded through an ``ObjectRetriever`` on construction and
    is written back through ``flush`` after a new e-mail has been parsed.
    """

    def __init__(self, config):
        self.retriever = ObjectRetriever(config)
        self.email_to_orders: Dict[str, List[Order]] = self.retriever.load(
            EMAIL_TO_ORDERS_FILENAME)

    @debounce(1)
    def flush(self):
        """Write the in-memory mapping back through the retriever."""
        self.retriever.flush(self.email_to_orders, EMAIL_TO_ORDERS_FILENAME)

    def get_orders(self, mail, email_id):
        """Return the orders for ``email_id``, parsing the e-mail on a miss.

        Args:
            mail: connection object exposing ``uid("FETCH", ...)``
                (presumably an imaplib client -- confirm with callers).
            email_id: UID of the message to look up.

        Returns:
            The list of ``Order`` objects stored for ``email_id``.
        """
        # Already parsed once: serve the stored result.
        if email_id in self.email_to_orders:
            return self.email_to_orders[email_id]

        _, fetched = mail.uid("FETCH", email_id, "(RFC822)")
        message = email.message_from_string(str(fetched[0][1], 'utf-8'))

        sent_at = datetime.datetime.strptime(message['Date'],
                                             '%a, %d %b %Y %H:%M:%S %z')
        date = sent_at.strftime('%Y-%m-%d')

        recipient = str(message['To'])
        for bracket in ('<', '>'):
            recipient = recipient.replace(bracket, '')

        # str() without an encoding is applied to the payload here, so line
        # breaks are matched as literal backslash sequences below.
        # NOTE(review): the last pair replaces '&' with '&' (a no-op as
        # written) -- possibly meant to decode an HTML entity; confirm.
        raw_email = str(fetched[0][1])
        substitutions = (
            ("=3D", "="),
            ('=\\r\\n', ''),
            ('\\r\\n', ''),
            ('&', '&'),
        )
        for needle, substitute in substitutions:
            raw_email = raw_email.replace(needle, substitute)

        found_ids = AmazonTrackingRetriever.get_order_ids_from_email(
            AmazonTrackingRetriever, raw_email)
        parsed_orders = [
            Order(order_id, date, recipient, False) for order_id in found_ids
        ]
        self.email_to_orders[email_id] = parsed_orders
        self.flush()
        return self.email_to_orders[email_id]
class NonPortalReimbursements:
    """Holds two retriever-backed mappings: tracking tuples -> costs and
    PO -> cost, as described by the attribute annotations below."""

    def __init__(self, config):
        retriever = ObjectRetriever(config)
        self.retriever = retriever
        # Loaded from NON_PORTAL_TRACKINGS_FILENAME.
        self.trackings_to_costs: Dict[Tuple[str], Tuple[
            str, float]] = retriever.load(NON_PORTAL_TRACKINGS_FILENAME)
        # Loaded from NON_PORTAL_POS_FILENAME.
        self.po_to_cost: Dict[str, float] = retriever.load(
            NON_PORTAL_POS_FILENAME)

    def flush(self):
        """Write both mappings back through the retriever, trackings first."""
        store = self.retriever
        store.flush(self.trackings_to_costs, NON_PORTAL_TRACKINGS_FILENAME)
        store.flush(self.po_to_cost, NON_PORTAL_POS_FILENAME)
class ArchiveManager:
    """Named archives of ``(trackings_info, po_cost)`` pairs.

    The archive dictionary is loaded through an ``ObjectRetriever`` and is
    flushed back every time an archive is stored.
    """

    def __init__(self, config):
        retriever = ObjectRetriever(config)
        self.retriever = retriever
        self.archive_dict = retriever.load(ARCHIVES_FILENAME)

    def get_archive(self, name):
        """Return the archive stored under ``name`` (KeyError if absent)."""
        archive = self.archive_dict[name]
        return archive

    def has_archive(self, name):
        """Return whether an archive is stored under ``name``."""
        known = name in self.archive_dict
        return known

    def put_archive(self, name, trackings_info, po_cost) -> None:
        """Store ``(trackings_info, po_cost)`` under ``name`` and persist."""
        entry = (trackings_info, po_cost)
        self.archive_dict[name] = entry
        self.retriever.flush(self.archive_dict, ARCHIVES_FILENAME)
class TrackingOutput:
    """Reads and writes ``Tracking`` objects through an ``ObjectRetriever``.

    Trackings are persisted as a mapping of group -> list of trackings and
    exposed to callers as a flat list.
    """

    def __init__(self, config) -> None:
        self.retriever = ObjectRetriever(config)

    def save_trackings(self, trackings, overwrite=False) -> None:
        """Merge ``trackings`` into the stored ones and persist the result."""
        combined = self.merge_trackings(self.get_existing_trackings(),
                                        trackings, overwrite)
        self._write_merged(combined)

    def get_tracking(self, tracking_number) -> Optional[Tracking]:
        """Returns the tracking object with the given tracking number if it exists."""
        # NOTE(review): stored numbers are upper-cased on load while the
        # argument is compared as given -- confirm callers pass upper case.
        candidates = self.get_existing_trackings()
        return next(
            (candidate for candidate in candidates
             if candidate.tracking_number == tracking_number),
            None,
        )

    def _write_merged(self, merged_trackings) -> None:
        """Group the trackings by their ``group`` attribute and persist them."""
        by_group = collections.defaultdict(list)
        for item in merged_trackings:
            bucket = by_group[item.group]
            bucket.append(item)
        self.retriever.flush(by_group, TRACKINGS_FILENAME)

    def merge_trackings(self, old_trackings: List[Tracking],
                        trackings: List[Tracking],
                        overwrite: bool) -> List[Tracking]:
        """Combine two tracking lists, keyed by tracking number.

        An incoming tracking is added when its number is not present yet;
        when ``overwrite`` is true it also replaces an existing entry.
        """
        by_number = {}
        for existing in old_trackings:
            by_number[existing.tracking_number] = existing
        for incoming in trackings:
            number = incoming.tracking_number
            if overwrite or number not in by_number:
                by_number[number] = incoming
        return list(by_number.values())

    def get_existing_trackings(self) -> List[Tracking]:
        """Load the stored trackings and return them as a flat list."""
        stored = self.retriever.load(TRACKINGS_FILENAME)
        return self._convert_to_list(stored)

    def _convert_to_list(self, trackings_dict):
        """Flatten the per-group lists and upper-case every tracking number."""
        flattened = [
            item for group_items in trackings_dict.values()
            for item in group_items
        ]
        # The tracking objects are normalised in place.
        for item in flattened:
            item.tracking_number = item.tracking_number.upper()
        return flattened
class EmailToOrders:
    """Maps an e-mail UID to the list of ``Order`` objects parsed from it.

    The mapping is loaded through an ``ObjectRetriever`` on construction and
    is written back through ``flush`` after a new e-mail has been parsed.
    """

    def __init__(self, config):
        self.retriever = ObjectRetriever(config)
        self.email_to_orders: Dict[str, List[Order]] = self.retriever.load(
            EMAIL_TO_ORDERS_FILENAME)

    @debounce(1)
    def flush(self):
        """Write the in-memory mapping back through the retriever."""
        self.retriever.flush(self.email_to_orders, EMAIL_TO_ORDERS_FILENAME)

    def get_orders(self, mail, email_id):
        """Return the orders for ``email_id``, parsing the e-mail on a miss.

        The FETCH is attempted up to three times; after each failure a fresh
        connection is obtained via ``email_auth.email_authentication()`` and
        the '"[Gmail]/All Mail"' mailbox is selected on it.

        Args:
            mail: connection object exposing ``uid("FETCH", ...)``
                (presumably an imaplib client -- confirm with callers).
            email_id: UID of the message to look up.

        Returns:
            The list of ``Order`` objects stored for ``email_id``.

        Raises:
            Exception: "Exceeded retry limit" when all three fetch attempts
                fail; the last underlying error is chained as its cause.
        """
        if email_id not in self.email_to_orders:
            last_error = None
            for _ in range(3):
                try:
                    _, data = mail.uid("FETCH", email_id, "(RFC822)")
                    break
                # Only ordinary errors are retried; KeyboardInterrupt and
                # SystemExit must propagate so the user can abort the run.
                except Exception as error:
                    last_error = error
                    tqdm.write(
                        f"Got exception, attempting retry up to 3 times...\n{util.get_traceback_lines()}"
                    )
                    mail = email_auth.email_authentication()
                    mail.select('"[Gmail]/All Mail"')
            else:
                raise Exception("Exceeded retry limit") from last_error

            msg = email.message_from_string(str(data[0][1], 'utf-8'))
            date = datetime.datetime.strptime(
                msg['Date'], '%a, %d %b %Y %H:%M:%S %z').strftime('%Y-%m-%d')
            to_email = str(msg['To']).replace('<', '').replace('>', '')
            # str() without an encoding is applied to the payload here, so
            # line breaks are matched as literal backslash sequences.
            # NOTE(review): the final replace('&', '&') is a no-op as written
            # -- possibly meant to decode an HTML entity; confirm.
            raw_email = str(data[0][1]).replace("=3D", "=").replace(
                '=\\r\\n', '').replace('\\r\\n', '').replace('&', '&')
            order_ids = AmazonTrackingRetriever.get_order_ids_from_email(
                AmazonTrackingRetriever, raw_email)
            self.email_to_orders[email_id] = [
                Order(order_id, date, to_email, False)
                for order_id in order_ids
            ]
            self.flush()
        return self.email_to_orders[email_id]
class OrderInfoRetriever:
    """
    A class that parses and stores the order numbers and email IDs for shipments.
    """

    def __init__(self, config) -> None:
        self.retriever = ObjectRetriever(config)
        self.orders_dict = self.retriever.load(ORDERS_FILENAME)
        self.mail = self.load_mail()

    def load_mail(self):
        """Authenticate and select the '"[Google Mail]/All Mail"' mailbox."""
        mail = email_auth.email_authentication()
        mail.select('"[Google Mail]/All Mail"')
        return mail

    @debounce(5)
    def flush(self) -> None:
        """Write the order cache back through the retriever."""
        self.retriever.flush(self.orders_dict, ORDERS_FILENAME)

    def get_order_info(self,
                       order_id,
                       fetch_from_email: bool = True) -> OrderInfo:
        """Return the ``OrderInfo`` for ``order_id``, fetching it if needed.

        Args:
            order_id: the order number to look up.
            fetch_from_email: when true, a cached entry whose cost is 0 or
                MISSING_COST_SENTINEL is looked up again.

        Returns:
            The cached or freshly parsed ``OrderInfo``. When nothing could be
            parsed for this order, an entry with ``MISSING_COST_SENTINEL`` is
            recorded and returned.
        """
        # Always fetch if we've never seen this order before, additionally fetch iff
        # we found a 0 or MISSING_COST_SENTINEL cost before and we want to retry.
        needs_fetch = order_id not in self.orders_dict or (
            fetch_from_email and
            (self.orders_dict[order_id].cost == 0 or isclose(
                self.orders_dict[order_id].cost, MISSING_COST_SENTINEL)))
        if needs_fetch:
            from_email = self.load_order_total(order_id)
            if not from_email:
                from_email = {order_id: OrderInfo(None, MISSING_COST_SENTINEL)}
            self.orders_dict.update(from_email)
            # The parsed email can yield totals for other order ids only
            # (fewer totals than order ids); record a sentinel instead of
            # failing with KeyError on the lookup below.
            if order_id not in self.orders_dict:
                self.orders_dict[order_id] = OrderInfo(None,
                                                       MISSING_COST_SENTINEL)
            self.flush()
        return self.orders_dict[order_id]

    def load_order_total(self, order_id: str) -> Dict[str, OrderInfo]:
        """Dispatch on the order id prefix: "BBY01" ids use the ``_bb``
        parser, everything else the Amazon parser."""
        if order_id.startswith("BBY01"):
            return self.load_order_total_bb(order_id)
        return self.load_order_total_amazon(order_id)

    def load_order_total_bb(self, order_id: str) -> Dict[str, OrderInfo]:
        """Parse subtotal + tax from the matching email for a "BBY01" order.

        Returns an empty dict when the email, the subtotal or the tax cannot
        be found.
        """
        from_email = '*****@*****.**'
        email_id, email_str = self.get_relevant_raw_email_data(
            order_id, from_email)
        if not email_str:
            print("Could not find email for order ID %s" % order_id)
            return {}

        regex_subtotal = r'Subtotal[^\$]*\$([\d,]+\.[\d]{2})'
        regex_tax = r'Tax[^\$]*\$([\d,]+\.[\d]{2})'

        subtotal_match = re.search(regex_subtotal, email_str)
        if not subtotal_match:
            return {}
        subtotal = float(subtotal_match.group(1).replace(',', ''))

        tax_match = re.search(regex_tax, email_str)
        if not tax_match:
            return {}
        tax = float(tax_match.group(1).replace(',', ''))

        return {order_id: OrderInfo(email_id, subtotal + tax)}

    @staticmethod
    def _parse_amounts(pattern, text):
        """Return every amount captured by ``pattern`` in ``text`` as floats."""
        return [
            float(cost.replace(',', '')) for cost in re.findall(pattern, text)
        ]

    def load_order_total_amazon(self, order_id: str) -> Dict[str, OrderInfo]:
        """Parse per-order totals from the matching Amazon email.

        Three formats are tried in turn: "Total Before Tax" + "Estimated
        Tax" pairs, the ``td.price`` cells handled by
        ``get_personal_amazon_totals``, and finally "Order Total" amounts.
        Order ids found in the email are paired positionally with the totals.

        Returns:
            Mapping of order id -> ``OrderInfo``; empty when no email or no
            totals were found.
        """
        email_id, email_str = self.get_relevant_raw_email_data(
            order_id, '*****@*****.**')
        if not email_str:
            tqdm.write(f"Could not find email for order ID {order_id}.")
            return {}

        regex_pretax = r'Total Before Tax:[^$]*\$([\d,]+\.\d{2})'
        regex_est_tax = r'Estimated Tax:[^$]*\$([\d,]+\.\d{2})'
        regex_order_total = r'Order Total:[^$]*\$([\d,]+\.\d{2})'
        regex_order = r'(\d{3}-\d{7}-\d{7})'

        # De-duplicate while keeping first-seen order.
        orders = list(dict.fromkeys(re.findall(regex_order, email_str)))

        # Sometimes it's been split into multiple orders. Find totals for each
        pretax_totals = self._parse_amounts(regex_pretax, email_str)
        if pretax_totals:
            taxes = self._parse_amounts(regex_est_tax, email_str)
            order_infos = [
                OrderInfo(email_id, pretax + tax)
                for pretax, tax in zip(pretax_totals, taxes)
            ]
            return dict(zip(orders, order_infos))

        # personal emails might not have the regexes, need to do something different
        personal_result = self.get_personal_amazon_totals(
            email_id, email_str, orders)
        if personal_result:
            return personal_result

        # amazon sometimes uses a new, odd format that only shows a single order total
        overall_totals = self._parse_amounts(regex_order_total, email_str)
        order_infos = [OrderInfo(email_id, total) for total in overall_totals]
        return dict(zip(orders, order_infos))

    def get_relevant_raw_email_data(
            self, order_id: str,
            from_email: str) -> Tuple[Optional[str], Optional[str]]:
        """Find the first email from ``from_email`` whose body has ``order_id``.

        Returns:
            ``(email_id, email_content)`` with "\\r\\n" sequences removed from
            the content, or ``(None, None)`` when the search has no hits.
        """
        _, search_result = self.mail.uid('SEARCH', None,
                                         f'BODY "{order_id}"',
                                         f'FROM "{from_email}"')
        raw_ids = search_result[0]
        if not raw_ids:
            return None, None
        email_ids = raw_ids.decode('utf-8').split()
        if not email_ids:
            return None, None

        email_str = email_tracking_retriever.get_email_content(
            email_ids[0], self.mail)
        email_str = email_str.replace('\r\n', '')
        return email_ids[0], email_str

    def get_personal_amazon_totals(self, email_id, email_str,
                                   orders) -> Dict[str, OrderInfo]:
        """Parse totals from ``td`` cells with class "price".

        Consecutive price cells are treated as (pretax, tax) pairs and the
        i-th pair is assigned to the i-th entry of ``orders``. Surplus pairs
        or surplus orders are ignored.
        """
        soup = BeautifulSoup(email_str, features="html.parser")
        price_texts = [
            elem.getText().strip().replace(',', '').replace('$', '')
            for elem in soup.find_all('td', {"class": "price"})
        ]
        prices = [float(price) for price in price_texts if price]
        # prices alternate between pretax / tax
        pair_totals = [
            pretax + tax for pretax, tax in zip(prices[0::2], prices[1::2])
        ]
        # zip truncates, so extra price pairs cannot index past ``orders``.
        return {
            order: OrderInfo(email_id, total)
            for order, total in zip(orders, pair_totals)
        }
class CancelledItemsRetriever:
    """Collects cancelled items per order from cancellation emails.

    Parsed results are cached per email id through an ``ObjectRetriever`` so
    that each email is only fetched and parsed once.
    """

    def __init__(self, config):
        self.retriever = ObjectRetriever(config)
        # map of {email_id: {order_id: cancelled_items}}
        self.email_id_dict = self.retriever.load(CANCELLATIONS_FILENAME)

    def get_cancelled_items(self) -> Dict[str, List[str]]:
        """Return a map of order_id -> list of cancelled item strings.

        Emails already present in the cache are not fetched again. Emails
        that yield no result (an empty mapping or ``None``) are skipped and
        not cached.
        """
        mail = self.load_mail()
        all_email_ids = self.get_all_email_ids(mail)

        result = {}
        for email_id, canc_info in tqdm(all_email_ids.items(),
                                        desc="Fetching cancellations",
                                        unit="email"):
            if email_id not in self.email_id_dict:
                email_result = self.get_cancellations_from_email(
                    mail, email_id, canc_info)
                if not email_result:
                    continue
                self.email_id_dict[email_id] = email_result
                self.flush()

            order_to_cancelled_items = self.email_id_dict[email_id]
            for order_id, items in order_to_cancelled_items.items():
                result.setdefault(order_id, []).extend(items)

        return result

    @retry(stop=stop_after_attempt(4),
           wait=wait_exponential(multiplier=1, min=2, max=120))
    def get_all_email_ids(self, mail) -> Dict[str, Tuple[CancFmt, CancQty]]:
        """Search the mailbox for every known cancellation subject.

        Returns:
            Mapping of email id -> ``(CancFmt, CancQty)`` describing how the
            email should be parsed. When an id matches several searches, the
            last matching entry wins.
        """
        # Each key is a tuple of phrases that must all appear in the subject.
        subject_searches = {
            ('Your Amazon.com order', 'has been canceled'):
                (CancFmt.IRRELEVANT, CancQty.NO),
            ('Your Amazon.com Order', 'Has Been Canceled'):
                (CancFmt.IRRELEVANT, CancQty.NO),
            ('Your Amazon.com Order', 'Has Been Cancelled'):
                (CancFmt.IRRELEVANT, CancQty.NO),
            ('Your AmazonSmile order', 'has been canceled'):
                (CancFmt.IRRELEVANT, CancQty.NO),
            ('Your AmazonSmile order', 'has been cancelled'):
                (CancFmt.IRRELEVANT, CancQty.NO),
            ('Item canceled for your Amazon.com order', ):
                (CancFmt.IRRELEVANT, CancQty.NO),
            (
                "Successful cancellation of",
                "from your AmazonSmile order",
            ): (CancFmt.VOLUNTARY, CancQty.YES),
            (
                "Successful cancellation of",
                "from your Amazon.com order",
            ): (CancFmt.VOLUNTARY, CancQty.YES),
            ("Partial item(s) cancellation from your Amazon.com order", ):
                (CancFmt.VOLUNTARY, CancQty.NO),
            ("item has been canceled from your AmazonSmile order", ):
                (CancFmt.INVOLUNTARY, CancQty.NO),
            ("items have been canceled from your AmazonSmile order", ):
                (CancFmt.INVOLUNTARY, CancQty.NO),
            ("items have been canceled from your Amazon.com order", ):
                (CancFmt.INVOLUNTARY, CancQty.NO),
            ("item has been canceled from your Amazon.com order", ):
                (CancFmt.INVOLUNTARY, CancQty.NO)
        }

        result_ids = {}
        for phrases, canc_info in subject_searches.items():
            criteria = [f'(SUBJECT "{phrase}")' for phrase in phrases]
            _, response = mail.uid('SEARCH', None, *criteria)
            email_ids = response[0].decode('utf-8')
            for email_id in email_ids.split():
                result_ids[email_id] = canc_info

        return result_ids

    @retry(stop=stop_after_attempt(4),
           wait=wait_exponential(multiplier=1, min=2, max=120))
    def get_cancellations_from_email(
            self, mail, email_id: str,
            canc_info: Tuple[CancFmt, CancQty]) -> Dict[str, List[str]]:
        """Parse one cancellation email.

        Args:
            mail: connection object exposing ``uid("FETCH", ...)``.
            email_id: UID of the email to fetch.
            canc_info: ``(format, quantity)`` pair selecting how the email is
                parsed and whether item strings already carry a quantity.

        Returns:
            ``{order_id: [cancelled item, ...]}`` for the first order id
            found in the email; ``{}`` when the email has no order id;
            ``None`` when parsing fails (the error is printed and skipped).

        Raises:
            Exception: when the FETCH itself fails (cause is chained).
        """
        try:
            _, data = mail.uid("FETCH", email_id, "(RFC822)")
        except Exception as e:
            raise Exception(f"Error retrieving email UID {email_id}") from e

        try:
            raw_email = data[0][1]
            orders = re.findall(r"(\d{3}-\d{7}-\d{7})", str(raw_email))
            if not orders:
                return {}
            order = orders[0]

            soup = BeautifulSoup(quopri.decodestring(raw_email),
                                 features="html.parser",
                                 from_encoding="iso-8859-1")

            if canc_info[0] == CancFmt.VOLUNTARY:
                cancelled_header = soup.find("h3", text="Canceled Items")
            elif canc_info[0] == CancFmt.INVOLUNTARY:
                cancelled_header = soup.find("span", text="Canceled Items")
            elif canc_info[0] == CancFmt.IRRELEVANT:
                return {order: []}
            else:
                raise Exception(
                    f"Can't handle cancellation format {canc_info[0]}")

            parent = cancelled_header.parent.parent.parent
            cancelled_items = []
            for li in parent.find_all('li'):
                # Each li contains a single link whose link text is the item name.
                canc_item = li.find('a').text.strip()
                # If cancellation email format contains quantity info, then use the string from
                # Amazon as-is, otherwise prepend with "??" to indicate indeterminate quantity.
                cancelled_items.append(canc_item if canc_info[1] ==
                                       CancQty.YES else f"?? {canc_item}")
            return {order: cancelled_items}
        except Exception as e:
            # Best effort: report the failure and let the caller skip this
            # email. Reading the subject must not raise again from here,
            # otherwise a malformed payload would abort the whole run.
            try:
                subject = email.message_from_string(
                    str(data[0][1], 'utf-8'))['Subject']
            except Exception:
                subject = None
            print(
                f"Received exception with message '{str(e)}' when processing cancellation email with subject {subject}:"
            )
            traceback.print_exc(file=sys.stdout)
            print("Continuing...")
            return None

    @retry(stop=stop_after_attempt(4),
           wait=wait_exponential(multiplier=1, min=2, max=120))
    def load_mail(self):
        """Authenticate and select the '"[Gmail]/All Mail"' mailbox."""
        mail = email_auth.email_authentication()
        mail.select('"[Gmail]/All Mail"')
        return mail

    @retry(stop=stop_after_attempt(4),
           wait=wait_exponential(multiplier=1, min=2, max=120))
    @debounce(2)
    def flush(self) -> None:
        """Write the per-email cancellation cache back through the retriever."""
        self.retriever.flush(self.email_id_dict, CANCELLATIONS_FILENAME)