import sys


def main():
    try:
        file_path = sys.argv[1]
        csv_obj = CSV(file_path)
        stock_list = csv_obj.stock_list
        resp = 'y'
        while resp.lower() == 'y':
            try:
                stock_name = input(
                    "Welcome Agent! Which stock do you need to process? ")
                is_exact_match, prefixed_words = csv_obj.t.list_words(
                    stock_name)
                stock_name = input_stock_name(prefixed_words, stock_name,
                                              is_exact_match)
                if stock_name in stock_list:
                    print('Processing for ' + stock_name)
                    start_date = parse_date(
                        input("From which date do you want to start? "))
                    end_date = parse_date(
                        input("Till which date do you want to analyze? "))
                    selected_stocks = select_stocks(stock_list[stock_name],
                                                    start_date, end_date)
                    mean_value, std = calculate_mean_std(selected_stocks)
                    buy_date, sell_date, profit = find_profit(selected_stocks)
                    print("Here is your result. "
                          "Mean: " + str(mean_value) +
                          ", Std: " + str(std) +
                          ", Buy date: " + str(buy_date) +
                          ", Sell date: " + str(sell_date) +
                          ", Profit: Rs. " + str(profit))
                else:
                    print("Stock not found")
            except Exception as e:
                print("Some error occurred:", e)
            resp = input("Do you want to continue? (y or n) ")
    except Exception as e:
        print("Could not process file:", e)
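The helpers `calculate_mean_std` and `find_profit` are called above but not defined in this file. Here is a minimal sketch of compatible implementations, assuming `selected_stocks` is a non-empty, date-ordered list of objects with `stock_date` and `price` attributes (the constructor further down matches that shape); the names come from the calls above, the bodies are assumptions:

```python
import statistics


def calculate_mean_std(selected_stocks):
    # Mean and standard deviation of the selected prices.
    # Note: statistics.stdev needs at least two data points.
    prices = [stock.price for stock in selected_stocks]
    return statistics.mean(prices), statistics.stdev(prices)


def find_profit(selected_stocks):
    # Classic single-pass max-profit scan: track the cheapest stock
    # seen so far and the best buy/sell pair found so far.
    cheapest = selected_stocks[0]
    buy = sell = selected_stocks[0]
    best_profit = 0.0
    for stock in selected_stocks[1:]:
        if stock.price < cheapest.price:
            cheapest = stock
        elif stock.price - cheapest.price > best_profit:
            best_profit = stock.price - cheapest.price
            buy, sell = cheapest, stock
    return buy.stock_date, sell.stock_date, best_profit
```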
def get_issue_date(issue_id):
    issue_url = '%s%s' % (ISSUE_URL, issue_id)
    response = get_url(issue_url)
    page = BeautifulSoup(response.read())
    issue_date = page.find('div', 'issue').strong.string
    issue_datetime = parse_date(issue_date)
    return issue_datetime
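The `find('div', 'issue').strong.string` lookup implies that the issue page wraps its date in a `<strong>` tag inside a `div` with class `issue`. That markup is inferred from the selector, not from the live page; a self-contained check of the assumption:

```python
from bs4 import BeautifulSoup

html = '<div class="issue">Issue of <strong>2 June 1884</strong></div>'
page = BeautifulSoup(html, 'html.parser')
# find('div', 'issue') filters on the CSS class when the second
# positional argument is a string.
assert page.find('div', 'issue').strong.string == '2 June 1884'
```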
def check_csv(file_name, year=None, exclude=[]):
    '''
    Check for missing editorials.

    year: check every day in the specified year
    exclude: list of weekdays to exclude from checking (eg. [6] to exclude Sunday)
    '''
    articles = csv.reader(open(file_name, 'rb'), delimiter=',', quotechar='"')
    article_dates = []
    missing_dates = []
    for article in articles:
        # Get the date
        date = parse_date(article[6])
        article_dates.append(date)
    article_dates.sort()
    duplicates = find_duplicates(article_dates)
    print 'Duplicates: %s' % len(duplicates)
    if year:
        start_date = datetime.date(year, 1, 1)
        end_date = datetime.date(year, 12, 31)
    else:
        start_date = article_dates[0]
        end_date = article_dates[-1]
    one_day = datetime.timedelta(days=1)
    this_day = start_date
    # Loop through each day in the specified period to see if there's an
    # article. If not, add the day to the missing_dates list.
    while this_day <= end_date:
        if this_day.weekday() not in exclude:  # eg. exclude Sundays
            if this_day not in article_dates:
                missing_dates.append(this_day)
        this_day += one_day
    print 'Missing: %s' % len(missing_dates)
    csv_out = csv.DictWriter(open(file_name, 'ab'),
                             extrasaction='ignore',
                             fieldnames=['id', 'title', 'url',
                                         'newspaper_title', 'newspaper_details',
                                         'newspaper_id', 'issue_date', 'page',
                                         'page_url', 'corrections', 'ftext'],
                             dialect=csv.excel)
    # Write a results file with nicely formatted dates
    with open(os.path.join(os.path.dirname(file_name), 'csv_check.html'), 'wb') as results:
        results.write('<html>\n<head>\n  <title>Results</title>\n</head>\n<body>')
        results.write('<h2>Duplicates:</h2>\n<table>\n')
        for dup in duplicates:
            results.write('<tr><td>%s</td><td><a href="%s">View issue</a></td></tr>\n' %
                          (format_date(dup), get_issue_url(dup, '35')))
        results.write('</table>\n')
        results.write('<h2>Missing:</h2>\n<table>\n')
        for missing in missing_dates:
            results.write('<tr><td>%s</td><td><a href="%s">View issue</a></td></tr>\n' %
                          (format_date(missing), get_issue_url(missing, '35')))
            csv_out.writerow({'issue_date': format_date(missing)})
        results.write('</table>\n')
        results.write('</body>\n</html>')
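`find_duplicates` is called here (and in `check_editorial` below) but not defined in this file. A minimal sketch of a compatible helper, assuming it should return every date that occurs more than once; the body is an assumption:

```python
from collections import Counter


def find_duplicates(dates):
    # Return each value that appears more than once in the input list.
    return [date for date, count in Counter(dates).items() if count > 1]
```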
def test_parse_date(self):
    cases = [
        ('2 June 1884',
         {'date': datetime.datetime(1884, 6, 2), 'day': True, 'month': True}),
        ('03 Jul 1921',
         {'date': datetime.datetime(1921, 7, 3), 'day': True, 'month': True}),
        ('13 Jul. 1921',
         {'date': datetime.datetime(1921, 7, 13), 'day': True, 'month': True}),
        ('Dec 1778',
         {'date': datetime.datetime(1778, 12, 1), 'day': False, 'month': True}),
        ('1962',
         {'date': datetime.datetime(1962, 1, 1), 'day': False, 'month': False}),
    ]
    for case in cases:
        self.assertEqual(utilities.parse_date(case[0]), case[1])
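The test pins down the contract of `utilities.parse_date`: it returns the parsed date plus flags recording whether a day and a month were actually present in the input, with missing components normalised to 1. A minimal sketch that satisfies these five cases, assuming `dateutil` is available; the real implementation may differ:

```python
import datetime

from dateutil import parser


def parse_date(date_string):
    # Parse twice with different defaults; any component that changes
    # between the two parses was not present in the input string.
    d1 = parser.parse(date_string, default=datetime.datetime(1900, 1, 1))
    d2 = parser.parse(date_string, default=datetime.datetime(1901, 2, 2))
    has_month = d1.month == d2.month
    has_day = d1.day == d2.day
    # Normalise missing components to 1, as the test cases expect.
    date = datetime.datetime(d1.year,
                             d1.month if has_month else 1,
                             d1.day if has_day else 1)
    return {'date': date, 'day': has_day, 'month': has_month}
```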
def __init__(self, stock_date, price):
    self.stock_date = parse_date(stock_date)
    self.price = float(price)
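A quick usage sketch, assuming this constructor belongs to a `Stock` class (hypothetical name) and that `parse_date` here returns a `datetime.date`:

```python
stock = Stock('03 Jul 1921', '152.40')
print(stock.stock_date)  # 1921-07-03, assuming parse_date returns a date
print(stock.price)       # 152.4
```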
def check_editorial(csv_file, year):
    print csv_file
    articles = csv.reader(open(csv_file, 'rb'), delimiter=',', quotechar='"')
    # Things to check:
    #   Duplicate dates
    #   Number of words
    #   Page number
    #   Missing -- article for each issue
    title_id = None
    title_name = None
    article_count = 0
    article_dates = []
    articles_filtered = []
    odd_pages = []
    short_articles = []
    long_articles = []
    missing_dates = []
    for article in articles:
        article_count += 1
        if not title_id:
            title_id = article[5]
        if not title_name:
            title_name = article[3]
        # Get the page number, falling back to the first run of digits
        try:
            page_number = int(article[7])
        except ValueError:
            page_number = int(re.search(r'(\d+)', article[7]).group(1))
        article_url = article[2]
        article_title = article[1]
        article_date = parse_date(article[6])
        text = article[10]
        word_count = len(text.split())
        details = {
            'url': article_url,
            'title': article_title,
            'date': article_date.isoformat(),
            'page': page_number,
            'length': word_count
        }
        # Check if page number is odd or even
        if page_number % 2:
            odd_pages.append(details)
        else:
            if word_count < 100:
                short_articles.append(details)
            elif word_count > 1500:
                long_articles.append(details)
            else:
                article_dates.append(article_date.isoformat())
                articles_filtered.append(details)
    # Collect duplicate dates
    duplicate_dates = sorted(set(find_duplicates(article_dates)))
    # Remove duplicates
    article_dates = sorted(set(article_dates))
    issues = get_title_issues(title_id, year)
    issue_dates = [issue['date'] for issue in issues]
    for issue_date in issue_dates:
        if issue_date not in article_dates:
            missing_dates.append(issue_date)
    print 'Total articles: %s' % article_count
    print 'Odd pages: %s' % len(odd_pages)
    print 'Short articles: %s' % len(short_articles)
    print 'Long articles: %s' % len(long_articles)
    print 'Duplicate dates: %s' % len(duplicate_dates)
    print 'Found articles: %s' % len(articles_filtered)
    print 'Missing articles: %s' % len(missing_dates)
    results = {
        'id': title_id,
        'name': title_name,
        'odd': odd_pages,
        'short': short_articles,
        'long': long_articles,
        'articles': articles_filtered,
        'duplicates': duplicate_dates,
        'missing': missing_dates,
        'issues': len(issues)
    }
    json_file = '%s/json/%s.js' % (NMA_FOLDER, os.path.basename(csv_file)[:-4])
    print json_file
    with open(json_file, 'wb') as json_data:
        json.dump(results, json_data, indent=2)
    return results
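A usage sketch, with a hypothetical file name; it assumes the harvested CSV follows the column layout indexed above (article title in column 1, URL in 2, newspaper title in 3, title id in 5, date in 6, page in 7, full text in 10):

```python
results = check_editorial('editorials-35.csv', 1913)
print '%s issues have no matching editorial' % len(results['missing'])
```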