def import_table(self, name, file):
    """Import a CSV file into a new table; returns the number of rows inserted."""
    data = csv.reader(file)
    name = self.get_table_name(name)
    if name in self.tables:
        return 0
    headers = []
    cursor = None
    places = ''
    clips = 0   # rows that had extra fields trimmed off
    count = 0
    for row in data:
        # Skip blank lines and comment lines.
        if not len(row):
            continue
        if str(row[0]).startswith('#'):
            continue
        # The first data row supplies the column headers.
        if not headers:
            headers = [each.strip() for each in row]
            places = ','.join('?' * len(headers))
            cursor = self.create_table(name, headers)
            continue
        # Pad short rows; trim (and count) overlong ones.
        if len(row) < len(headers):
            row.extend([''] * (len(headers) - len(row)))
        elif len(row) > len(headers):
            clips += 1
        cursor.execute('insert into %s values (%s);' % (name, places),
                       row[:len(headers)])
        count += 1
    return count
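
# A minimal sketch of the host class that import_table appears to assume
# (self.tables, get_table_name, create_table). The class name, the sqlite3
# backend, and both helper bodies are assumptions for illustration, not
# taken from the source.
import sqlite3

class CsvDb(object):
    def __init__(self, path=':memory:'):
        self.conn = sqlite3.connect(path)
        self.tables = set()

    def get_table_name(self, name):
        # Reduce an arbitrary label to a SQL-safe identifier.
        return ''.join(c if c.isalnum() else '_' for c in name.lower())

    def create_table(self, name, headers):
        # One text column per CSV header; returns the cursor that
        # import_table uses for the inserts.
        cols = ', '.join('"%s" text' % h for h in headers)
        cursor = self.conn.cursor()
        cursor.execute('create table %s (%s);' % (name, cols))
        self.tables.add(name)
        return cursor
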
def scrape_ri_pages():
    f = open('ri.csv', 'r')
    reader = ucsv.reader(f)
    for row in reader:
        scrape_ri_page(row[0])
        time.sleep(3)
    f.close()
def scrape_links(read_csv=False, export_csv=False, update=True):
    """Get the links for all US courses from the PDGA site.

    :param read_csv: read the links from pgda_courses.csv instead of the site
    :param export_csv: whether or not to export a CSV file: pgda_courses.csv
    :param update: unused here
    :returns: links, a list of (name, url) tuples for courses
    """
    links = []
    if read_csv:
        # Fetch the links from the CSV file.
        f = open('pgda_courses.csv', 'r')
        reader = ucsv.reader(f)
        for row in reader:
            links.append((row[0], row[3]))
        f.close()
    else:
        # Fetch the links from the site.
        DIRECTORY_URL = 'http://www.pdga.com/course_directory/country/us'
        start_r = requests.get(DIRECTORY_URL)
        start_soup = BeautifulSoup(start_r.text)
        # The "last page" pager link tells us how many pages to walk.
        # attrs is a list of (name, value) pairs here (BeautifulSoup 3),
        # so attrs[0][1] is the href value.
        last_link = start_soup.find('a', {'class': 'pager-last active'}).attrs[0][1]
        last_page = int(re.search(r'page=(\d+)', last_link).group(1))
        if export_csv:
            f = open('pgda_courses.csv', 'wt')
        try:
            if export_csv:
                writer = ucsv.writer(f)
            for page in range(0, last_page + 1):
                if page == 0:
                    link = DIRECTORY_URL
                else:
                    link = '%s?page=%s' % (DIRECTORY_URL, page)
                r = requests.get(link)
                soup = BeautifulSoup(r.text)
                for row in soup('table')[0].tbody('tr'):
                    tds = row.findAll('td')
                    course_link = '%s%s' % (BASE_URL,
                                            tds[0].a.attrs[0][1].split('?s=')[0])
                    name = tds[0].text
                    city = tds[1].text
                    province = tds[2].text
                    if export_csv:
                        writer.writerow((name, city, province, course_link))
                    links.append((name, course_link))
                    print('added %s' % course_link)
        finally:
            if export_csv:
                f.close()
    return links
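
# A minimal usage sketch for scrape_links, reading the cached CSV rather
# than re-crawling the site:
if __name__ == '__main__':
    for name, url in scrape_links(read_csv=True):
        print('%s -> %s' % (name, url))
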
def parse_rt_csv(rt_csv_filename):
    """Yield one (field, field, field) tuple per row of a three-column CSV."""
    with open(rt_csv_filename, 'rb') as csv_file:
        reader = ucsv.reader(csv_file)
        for n, row in enumerate(reader):
            logging.info('Reading row %d', n)
            if len(row) != 3:
                raise Exception(
                    "Wrong number of fields in row {0}: {1}".format(n + 1, row))
            yield tuple(row[0:3])
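
# A minimal usage sketch for the parse_rt_csv generator; 'rt.csv' is a
# placeholder filename, not taken from the source:
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    for first, second, third in parse_rt_csv('rt.csv'):
        print('%s %s %s' % (first, second, third))
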
def guess_dialect(filename):
    """Tries to guess the dialect of a CSV file by picking the registered
    dialect that splits the header row into the most columns."""
    best = ''
    max_columns = 0
    for dialect in csv.list_dialects():
        f = open(filename, 'r')
        rd = ucsv.reader(f, dialect=dialect)
        header = rd.next()
        if len(header) > max_columns:
            max_columns = len(header)
            best = dialect
        f.close()
    return best
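
# The standard library's csv.Sniffer does the same job from a content
# sample; a minimal alternative sketch, not part of the original code:
def sniff_dialect(filename, sample_size=4096):
    with open(filename, 'r') as f:
        return csv.Sniffer().sniff(f.read(sample_size))
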
def convert(list_of_indexes, sheet_names):
    """Convert every CSV in /data/in/tables into one sheet each of a single
    XLSX file, rendering the columns listed in list_of_indexes as text."""
    listOfFiles = glob.glob("/data/in/tables/*.csv")
    excelFile = xlsxwriter.Workbook('CE_WK' + '.xlsx')
    for index, fileInList in enumerate(listOfFiles):
        worksheet = excelFile.add_worksheet(str(sheet_names[index]))
        with open(fileInList, 'rb') as f:
            content = csv.reader(f)
            for index_row, data_in_row in enumerate(content):
                for index_col, data_in_cell in enumerate(data_in_row):
                    if index_col in list_of_indexes[index]:
                        # Columns flagged as text: force Excel to render
                        # numbers through a TEXT() formula.
                        if type(data_in_cell) in (int, float):
                            temp = '=TEXT(%d,"*#*,######")' % data_in_cell
                            worksheet.write_formula(index_row, index_col, temp)
                        elif data_in_cell == ' ':
                            worksheet.write_blank(index_row, index_col, None)
                        else:
                            worksheet.write(index_row, index_col,
                                            unicode(data_in_cell))
                    else:
                        if data_in_cell == ' ':
                            worksheet.write_blank(index_row, index_col, None)
                        elif type(data_in_cell) in (int, float):
                            worksheet.write_number(index_row, index_col,
                                                   data_in_cell)
                        else:
                            worksheet.write(index_row, index_col,
                                            unicode(data_in_cell))
    print " === conversion is done ==="
    excelFile.close()
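
# A minimal usage sketch for convert: mark columns 0 and 2 of the first
# CSV as text columns and name its sheet 'sheet_one'. These values are
# illustrative only; the real caller derives them by scanning the CSVs.
if __name__ == '__main__':
    convert(list_of_indexes=[[0, 2]], sheet_names=['sheet_one'])
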
def normalize(text):
    # Tokenizer for TfidfVectorizer: lowercase, strip punctuation, tokenize.
    return nltk.word_tokenize(text.lower().translate(remove_punctuation_map))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    """Cosine similarity of two texts in their shared TF-IDF space."""
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0, 1]

# Define the web browser as PhantomJS and have it masquerade as Google Chrome.
browser = defineBrowser()

with open('Main_Study_Comments.csv', 'rb') as fr:
    reader = csv.reader(fr)
    with open('Main_Study_Comments_Abstracts.csv', 'wb') as fw:
        writer = csv.writer(fw)
        pubResultRow = [
            "Firstname", "Lastname", "Specialization", "Affiliation",
            "location", "Phase 1", "Phase 2", "Phase 3", "High View",
            "Citation Benefit", "Private Comes First", "Acknowledgement",
            "Likely to Cite", "May Include Reference", "Might Refer to",
            "Likely to Cite + Acknowledgement",
            "May Include Reference + Acknowledgement",
            "Might Refer to + Acknowledgement", "Especially popular",
            "Highly visible", "Highly popular", "Manual Recommendation",
            "Track changed", "Inappropriate Comment", "Interested",
            "Withdrawal", "Email 1 Opened", "Email 2 Opened",
            "Email 3 Opened",
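
# A minimal usage sketch for cosine_sim defined above; the example strings
# are illustrative, not from the study data:
print cosine_sim('the cat sat on the mat', 'a cat sat on a mat')
print cosine_sim('the cat sat on the mat', 'stock prices fell sharply')
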
if row[1] not in meta_csv:
    meta_csv.append(row[1])
'''

#meta_query = db.query("SELECT * FROM Meta")
meta_results = cursor.execute("SELECT * FROM Meta")
for meta_result in cursor.fetchall():
    meta.append({'id': int(meta_result[0]), 'name': meta_result[1]})

genre_results = cursor.execute("SELECT * FROM Genre")
for genre_result in cursor.fetchall():
    genre.append({'id': int(genre_result[0]), 'name': genre_result[1]})

with codecs.open('bdm.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        # Skip the header row.
        if row[0] == 'Style musical':
            continue
        # Look up the genre id for this row's genre name.
        genre_id = False
        for g in genre:
            if row[0] == g['name']:
                genre_id = g['id']
        '''
        # Add synonyms
        for row_syn in row[4].split(','):
            row_syn = row_syn.strip('?').strip()
            for g in genre:
                if row_syn.lower() == g['name'].lower():
                    syn_id = g['id']
def main():
    # Set up the argument parser.
    parser = argparse.ArgumentParser(
        description='Convert price to ShopOS CSV import file.')
    # Input XLS file (the price list).
    parser.add_argument('arguments', metavar='ARG', type=str, nargs=1,
                        help='input XLS file')
    # Include descriptions in the output CSV.
    parser.add_argument('-d', dest='descr', action='store_true',
                        help='enable description')
    # Compare the price list against export.csv.
    parser.add_argument('-c', dest='compare', action='store_true',
                        help='compare prices')
    # Download export.csv from the server.
    parser.add_argument('-e', dest='download', action='store_true',
                        help='download export.csv')
    # Extract brands.
    parser.add_argument('-b', dest='brands', action='store_true',
                        help='filter brands')
    # Create an additional -add.csv file with descriptions included
    # (used for the hardware price list).
    parser.add_argument('-f', dest='descfilter', action='store_true',
                        help='filter descriptions (for tools)')
    args = parser.parse_args()

    # If -e was given, download export.csv from the site.
    if args.download:
        download_export()

    # xls_input is the input XLS price list.
    xls_input = args.arguments[0]
    debug_print('Reading XLS ' + xls_input)
    # Load the input price list into table_input:
    # table_input[category] = [int(SKU), unicode(name), unicode(description),
    #                          int(wholesale), int(retail), str(stock marks)]
    table_input = xlsread(xls_input)

    # Name of the output CSV file.
    csv_output_fname = args.arguments[0][:-3] + 'csv'
    debug_print('Creating CSV ' + csv_output_fname)
    # Create the CSV file, '|' delimited.
    file_csv_output = open(csv_output_fname, 'wb')
    csv_output = csv.writer(file_csv_output, delimiter='|')

    # If hardware filtering is requested, create a second CSV ...
    if args.descfilter:
        csv_output_fname_add = args.arguments[0][:-4] + '-add.csv'
        file_csv_output_add = open(csv_output_fname_add, 'wb')
        debug_print('Creating CSV ' + csv_output_fname_add)
        csv_output_add = csv.writer(file_csv_output_add, delimiter='|')
        # ... and force comparison mode as well.
        args.compare = True

    # If comparison is needed,
    if args.compare:
        # open the previously prepared export-out.csv
        # (the export data from the site).
        csv_compare = csv.reader(open('export-out.csv', 'rb'), delimiter='|')
        # table_compare holds everything needed to compare against the site:
        # table_compare[int(SKU)] = [int(wholesale), int(retail),
        #                            reduced-price flag, description]
        table_compare = {}
        for i, row in enumerate(csv_compare):
            if (i != 0) and (row[11][2:-5] != '') and (int(row[8]) != 0):
                table_compare[int(row[1])] = [int(row[11][2:-5]), int(row[8]),
                                              row[15], row[29]]  # opt, rozn, ean, descr

    # The output table.
    table_output = []
    # Write the CSV header rows.
    if args.descfilter:
        table_output_add = []
        table_output_add.append(['XTSOL', 'p_model', 'p_stock', 'p_sorting',
            'p_shipping', 'p_tpl', 'p_manufacturer', 'p_fsk18', 'p_priceNoTax',
            'p_priceNoTax.1', 'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax',
            'p_status', 'p_weight', 'p_ean', 'p_disc', 'p_opttpl', 'p_vpe',
            'p_vpe_status', 'p_vpe_value', 'p_name.ru', 'p_desc.ru',
            'p_shortdesc.ru', 'p_meta_title.ru', 'p_meta_desc.ru',
            'p_meta_key.ru', 'p_url.ru', 'p_cat.0', 'p_cat.1', 'p_cat.2',
            'p_cat.3', 'p_cat.4', 'p_cat.5'])
    if args.descr:
        header = ['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping',
            'p_tpl', 'p_manufacturer', 'p_fsk18', 'p_priceNoTax',
            'p_priceNoTax.1', 'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax',
            'p_status', 'p_weight', 'p_ean', 'p_disc', 'p_opttpl', 'p_vpe',
            'p_vpe_status', 'p_vpe_value', 'p_name.ru', 'p_desc.ru',
            'p_shortdesc.ru', 'p_meta_title.ru', 'p_meta_desc.ru',
            'p_meta_key.ru', 'p_url.ru', 'p_cat.0', 'p_cat.1', 'p_cat.2',
            'p_cat.3', 'p_cat.4', 'p_cat.5']
    else:
        header = ['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping',
            'p_tpl', 'p_manufacturer', 'p_fsk18', 'p_priceNoTax',
            'p_priceNoTax.1', 'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax',
            'p_status', 'p_weight', 'p_ean', 'p_disc', 'p_opttpl', 'p_vpe',
            'p_vpe_status', 'p_vpe_value', 'p_name.ru', 'p_meta_title.ru',
            'p_meta_desc.ru', 'p_meta_key.ru', 'p_url.ru', 'p_cat.0',
            'p_cat.1', 'p_cat.2', 'p_cat.3', 'p_cat.4', 'p_cat.5']
    table_output.append(header)

    debug_print('Starting parsing catalog')
    # Walk the price list one entry at a time;
    # category is the price-list category the entry comes from.
    for category in table_input:
        # item is the entry itself with all its data.
        for item in table_input[category]:
            # DEFAULTS {{
            p_status = '1'  # product status: always enabled
            # The remaining defaults do not matter.
            p_sorting = '0'   # sorting
            p_shipping = '0'  # shipping
            p_tpl = p_opttpl = 'default'  # templates
            p_fsk18 = p_tax = p_vpe = p_vpe_status = '0'
            p_priceNoTax_1 = p_priceNoTax_2 = p_url = ''
            p_weight = '0.00'
            p_disc = p_vpe_value = '0.0000'
            # }} // DEFAULTS

            p_model = str(item[0])               # SKU
            p_stock = config.QTY[item[5]]        # availability
            p_name = item[1].replace('/', '/ ')  # name
            if args.brands:
                [p_manufacturer, pricebrand] = brand(p_name, p_model)  # manufacturer
            else:
                [p_manufacturer, pricebrand] = ['', '']
            # Short description.
            p_shortdesc = short_desc(category, item[2])
            # Templated SEO description, still a work in progress.
            seo_description = seo_desc(p_name, category,
                                       brand(p_name, p_model)[1])
            if seo_description == config.LEAVE_OLD_DESC:
                if args.compare:
                    try:
                        p_desc = table_compare[int(p_model)][3]
                    except:
                        debug_print_pause(table_compare[11928])
                else:
                    pass
            else:
                if p_shortdesc != '':
                    # Description with extras.
                    p_desc = '<p>' + p_shortdesc + '</p><br/>' + seo_description
                else:
                    p_desc = seo_description
            #p_desc = p_shortdesc
            # Use this simple construction instead:
            #p_desc = ''
            #if short_desc != '':
            #    p_desc = '<p>' + p_shortdesc + '</p>'

            # Filter availability/description for hardware items.
            if args.descfilter:
                # By default, give the item the description from the price list.
                is_add = True
                try:
                    # If the site already has a description, or the price
                    # list has none, insert nothing.
                    if (table_compare[int(p_model)][3] != '') or (p_shortdesc == ''):
                        is_add = False
                except KeyError, e:
                    pass

            # Wholesale price.
            p_priceNoTax_3 = str(int(math.ceil(item[3])))
            # Retail price.
            p_priceNoTax = str(int(math.ceil(item[4])))
            # Categorize.
            p_cat = getcat(category, p_name)
            # Meta tags.
            [temp_name, p_meta_title, p_meta_desc, p_meta_key] = \
                metatags(p_name, p_shortdesc, p_cat, p_model)
            if temp_name != '':
                p_name = temp_name
            # Reduced price.
            p_ean = ''
            if args.compare:
                p_ean = ean(p_model, p_priceNoTax_3, p_priceNoTax, table_compare)

            # Merge everything collected into one row and dump it to CSV.
            if args.descfilter:
                # This is the hardware price list, which has to be split
                # into two files.
                if is_add:
                    table_output_add.append(['XTSOL', p_model, p_stock,
                        p_sorting, p_shipping, p_tpl, p_manufacturer,
                        p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1,
                        p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000',
                        p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl,
                        p_vpe, p_vpe_status, p_vpe_value, p_name, p_desc,
                        p_shortdesc, p_meta_title, p_meta_desc, p_meta_key,
                        p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3],
                        p_cat[4], p_cat[5]])
                else:
                    table_output.append(['XTSOL', p_model, p_stock,
                        p_sorting, p_shipping, p_tpl, p_manufacturer,
                        p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1,
                        p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000',
                        p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl,
                        p_vpe, p_vpe_status, p_vpe_value, p_name,
                        p_meta_title, p_meta_desc, p_meta_key, p_url,
                        p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4],
                        p_cat[5]])
            else:
                # All other cases.
                if args.descr:
                    row = ['XTSOL', p_model, p_stock, p_sorting, p_shipping,
                        p_tpl, p_manufacturer, p_fsk18,
                        p_priceNoTax + '.0000', p_priceNoTax_1,
                        p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000',
                        p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl,
                        p_vpe, p_vpe_status, p_vpe_value, p_name, p_desc,
                        p_shortdesc, p_meta_title, p_meta_desc, p_meta_key,
                        p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3],
                        p_cat[4], p_cat[5]]
                else:
                    row = ['XTSOL', p_model, p_stock, p_sorting, p_shipping,
                        p_tpl, p_manufacturer, p_fsk18,
                        p_priceNoTax + '.0000', p_priceNoTax_1,
                        p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000',
                        p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl,
                        p_vpe, p_vpe_status, p_vpe_value, p_name,
                        p_meta_title, p_meta_desc, p_meta_key, p_url,
                        p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4],
                        p_cat[5]]
                table_output.append(row)
        desired_capabilities=dcap, service_args=['--load-images=no'])
    # firefox_profile = webdriver.FirefoxProfile()
    # firefox_profile.set_preference('permissions.default.stylesheet', 2)
    # firefox_profile.set_preference('permissions.default.image', 2)
    # browser = webdriver.Firefox(firefox_profile)
    return browser

RecommendationRepetition = {}
WikipagesStats = {}
with open('Main_Study_Comments.csv', 'rb') as fr:
    reader = csv.reader(fr)
    header = next(reader)
    for row in reader:
        # Normalise the recommended Wikipedia URL (column 34) so that
        # equivalent links count as the same page.
        row[34] = row[34].replace('http://', 'https://')
        row[34] = row[34].replace('https://en.wikipedia.org/?title=',
                                  'https://en.wikipedia.org/wiki/')
        row[34] = row[34].replace('%28', '(')
        row[34] = row[34].replace('%29', ')')
        row[34] = row[34].replace('%27', "'")
        # row[34] = row[34].replace('%E2%80%93', "–")
        if row[34] in RecommendationRepetition:
            RecommendationRepetition[row[34]] += 1
        else:
            RecommendationRepetition[row[34]] = 1
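
# The if/else counting above is the pattern collections.Counter captures;
# a minimal equivalent sketch ('normalised_urls' is a hypothetical list of
# the cleaned row[34] values, not a name from the source):
from collections import Counter
def count_recommendations(normalised_urls):
    return Counter(normalised_urls)
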
#coding: utf-8
import sys
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
import time, Tools, CTMConst
import ucsv as csv
from selenium.webdriver.common import keys

f1 = open('payees.csv', 'rb')
Payees = csv.reader(f1)
FailedPayees = []

driver = webdriver.Ie()
driver.implicitly_wait(5)
Tools.WinAuthLogin(CTMConst.CTMUrl, driver)

#------------
Field = driver.find_element_by_xpath(
    "//input[@type='text'][@id='LoginPart__ctl5_txtUserName']")
Field.send_keys("bioctest1")
Field = driver.find_element_by_xpath("//input[@type='password']")
Field.send_keys("Suicide1`")
htmlElement = Tools.FindInFramesXP("//input[@value='Sign In'][@type='submit']",
                                   driver)
if htmlElement != None:
    htmlElement.click()
else:
    print "Error - Sign In button not found."
try:
    driver.switch_to_alert().accept()
    time.sleep(4)
    #driver.get("http://ts-host-3/ctpm/StudySetup/StudyList.aspx")
except(WebDriverException):
# along with MT/I. If not, see <http://www.gnu.org/licenses/>.

# Please only use these spiders and their derivatives in accordance
# with the terms of service and acceptable use policies of the data
# providers.

import csv, json, ucsv
import lxml.etree

countries = []
links = open('./country_links.csv', 'r')
output = open('./countries.json', 'w')
reader = ucsv.reader(links, csv.excel_tab)
for row in reader:
    error = False
    country = {}
    country['name'] = row[0]
    file_name = 'countries/' + row[1][6:] + '.html'
    data = open(file_name).read()
    tree = lxml.etree.XML(data)
    xpath = "//table/tr/th/a[@title='Demonym']/../../td/a"
    try:
mainDatasetNamesList = [
    'Ideas_Repec_Dataset_Pilot2_Standard',
    'Ideas_Repec_Dataset_Standard',
    'Ideas_Repec_Dataset1_Standard',
    'Ideas_Repec_Dataset3_Standard',
    'Ideas_Repec_Dataset4_Standard',
    'Ideas_Repec_Dataset10_Standard'
]
usedDatasetNamesList = [
    'Ideas_Repec_Dataset_Pilot1_Used',
    'Ideas_Repec_Dataset_Pilot2_Clean'
]
outputDatasetName = 'Ideas_Repec_Dataset_Pilot3_Clean'

# Collect every name/email that has already been used.
for usedDatasetName in usedDatasetNamesList:
    with open(usedDatasetName + '.csv', 'rb') as frUsed:
        readerUsed = csv.reader(frUsed)
        headerUsed = next(readerUsed)
        for row in readerUsed:
            email = row[2]
            if email not in usedEmailsList:
                usedFirstNamesList.append(row[0])
                usedLastNamesList.append(row[1])
                usedEmailsList.append(email)
                print email, "entered."

with open(outputDatasetName + '.csv', 'wb') as fw:
    writer = csv.writer(fw)
    resultRow = [
data = json.load(jsonFile)
if data["parameters"]["#S3key"] == '' or \
        data["parameters"]["#S3secretKey"] == '' or \
        data["parameters"]["bucketName"] == '':
    print " === config json parameters empty ==="
    sys.exit(1)

if __name__ == '__main__':
    listOfFiles = glob.glob("/data/in/tables/*.csv")
    list_of_indexes = [[]]
    sheet_names = []
    for fileName in listOfFiles:
        # Strip the '/data/in/tables/' prefix and the '.csv' suffix.
        sheet_names.append(fileName[16:-4])
    for index, fileInList in enumerate(listOfFiles):
        with open(fileInList, 'rb') as f:
            content = csv.reader(f)
            for index_row, data_in_row in enumerate(content):
                for index_col, data_in_cell in enumerate(data_in_row):
                    # Remember which columns hold text (skip the header row).
                    if type(data_in_cell) == unicode and index_row != 0:
                        list_of_indexes[index].append(index_col)
        # De-duplicate the column indexes for this file.
        list_of_indexes[index] = dict.fromkeys(list_of_indexes[index]).keys()
        list_of_indexes.append([])
    convert(list_of_indexes, sheet_names)

    conn = tinys3.Connection(data["parameters"]["#S3key"],
                             data["parameters"]["#S3secretKey"])
    listOfFiles2 = glob.glob("/data/in/tables/*.xlsx")
    for file in listOfFiles2:
        temp = os.path.splitext(file[16:len(file)])
        print " === uploading " + 'CE_WK' + str(int(time.strftime("%V")) -
#coding: utf-8
import sys
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
import time, Tools, CTMConst
import ucsv as csv
from selenium.webdriver.common import keys

# Read the study list up front so the file can be closed before the
# long-running browser session starts.
f1 = open('studies.csv', 'rb')
Studies = list(csv.reader(f1))
f1.close()

driver = webdriver.Ie()
driver.implicitly_wait(5)
Tools.WinAuthLogin(CTMConst.CTMUrl, driver)

#------------
Field = driver.find_element_by_xpath(
    "//input[@type='text'][@id='LoginPart__ctl5_txtUserName']")
Field.send_keys("Viktor.klymenko")
Field = driver.find_element_by_xpath("//input[@type='password']")
Field.send_keys("Suicide1`")
htmlElement = Tools.FindInFramesXPRec("//input[@value='Sign In'][@type='submit']",
                                      driver)
if htmlElement != None:
    htmlElement.click()
else:
    print "Error - Sign In button not found."

#----------
for StudyNo in Studies:
    try:
        Tools.NavigateToScreen(CTMConst.StudyList, driver)
        inputElement = Tools.FindInFramesXPRec("//a[starts-with(text(), '3')]",
                                               driver)
#
# You should have received a copy of the GNU General Public License
# along with MT/I. If not, see <http://www.gnu.org/licenses/>.

# Please only use these spiders and their derivatives in accordance
# with the terms of service and acceptable use policies of the data
# providers.

import csv, gzip, ucsv, urllib2, time
from StringIO import StringIO

max = 0
input = open('./country_links.csv', 'r')
reader = ucsv.reader(input, csv.excel_tab)
count = 0
for row in reader:
    count += 1
    url = 'http://en.wikipedia.org' + row[1]
    print url
    request = urllib2.Request(
        url, headers={'User-Agent': 'manytopics/international'})
    response = urllib2.urlopen(request)