Example #1
0
 def import_table(self, name, file):
     """Load CSV rows from *file* into a newly created table.

     Blank rows and rows whose first cell starts with ``#`` are skipped.
     The first remaining row supplies the column headers (whitespace
     stripped) and is handed to ``self.create_table``; every later row is
     inserted through the cursor that call returns.  Rows shorter than the
     header are padded with empty strings, longer rows are truncated to
     the header width.

     :param name: requested table name; normalised through
         ``self.get_table_name``.
     :param file: iterable of CSV lines (anything ``csv.reader`` accepts).
     :returns: the number of inserted data rows, or ``None`` when the
         normalised name is already present in ``self.tables``.
     """
     data = csv.reader(file)
     name = self.get_table_name(name)
     if name in self.tables:
         return
     headers = []
     width = 0
     cursor = None
     statement = None
     count = 0
     for row in data:
         # Skip blank lines and '#' comment lines.
         if not row or str(row[0]).startswith('#'):
             continue
         if not headers:
             headers = [each.strip() for each in row]
             width = len(headers)
             # Built once instead of once per row.  Table names cannot be
             # bound as SQL parameters, so the name is interpolated; the
             # row values always go through '?' placeholders.
             statement = 'insert into %s values (%s);' % (
                 name, ','.join('?' * width))
             cursor = self.create_table(name, headers)
             continue
         if len(row) < width:
             row.extend([''] * (width - len(row)))
         cursor.execute(statement, row[:width])
         count += 1
     return count
Example #2
0
def scrape_ri_pages():
    """Scrape every page listed in ``ri.csv``.

    The first column of each row is passed to ``scrape_ri_page``, with a
    three-second pause after every row.
    """
    # The context manager closes the file even when a scrape raises;
    # the original only closed it after a fully successful run.
    with open('ri.csv', 'r') as f:
        for row in ucsv.reader(f):
            scrape_ri_page(row[0])
            time.sleep(3)
Example #3
0
def scrape_links(read_csv=False, export_csv=False, update=True):
    """
    Collect course links from the PDGA US course directory.

    :param read_csv: when true, read the links back from
        ``pgda_courses.csv`` instead of scraping the site.
    :param export_csv: when scraping, also write one
        ``(name, city, province, course_link)`` row per course to
        ``pgda_courses.csv``.
    :param update: accepted for backward compatibility; not used here.

    :returns: list of links.  When read from the CSV it holds the fourth
        column of every row; when scraped it holds
        ``(name, course_link)`` tuples.
    """
    links = []

    if read_csv:
        # fetch the links from the csv file; `with` closes the file even
        # if a row is malformed
        with open('pgda_courses.csv', 'r') as f:
            for row in ucsv.reader(f):
                links.append(row[3])
        return links

    # fetch the links from the site
    DIRECTORY_URL = 'http://www.pdga.com/course_directory/country/us'
    start_r = requests.get(DIRECTORY_URL)
    start_soup = BeautifulSoup(start_r.text)

    last_link = start_soup.find('a', {'class':'pager-last active'}).attrs[0][1]
    # \d+ accepts any number of digits.  The previous pattern required
    # exactly two, so a single-digit last page raised AttributeError and a
    # three-digit one was silently truncated.
    last_page = int(re.search(r'page=(\d+)', last_link).group(1))

    f = open('pgda_courses.csv', 'wt') if export_csv else None
    try:
        writer = ucsv.writer(f) if export_csv else None

        for page in range(0, last_page+1):
            if page == 0:
                link = DIRECTORY_URL
            else:
                link = '%s&page=%s' % (DIRECTORY_URL, page)

            r = requests.get(link)
            soup = BeautifulSoup(r.text)
            for row in soup('table')[0].tbody('tr'):
                tds = row.findAll('td')

                # Drop the '?s=' suffix from the course href.
                course_link = "%s%s" % (BASE_URL, tds[0].a.attrs[0][1].split('?s=')[0])
                name = tds[0].text
                city = tds[1].text
                province = tds[2].text

                if writer is not None:
                    writer.writerow( (name, city, province, course_link) )
                links.append((name, course_link))

                print('added %s' % course_link)
    finally:
        if f is not None:
            f.close()
    return links
Example #4
0
def parse_rt_csv(rt_csv_filename):
    """Yield every row of the CSV file *rt_csv_filename* as a 3-tuple.

    The file is opened in binary mode and parsed with ``ucsv.reader``.
    The zero-based index of each row is logged at INFO level before the
    row is validated.

    :raises Exception: when a row does not have exactly three fields; the
        message carries the one-based row number and the row itself.
    """
    with open(rt_csv_filename, 'rb') as handle:
        index = -1
        for fields in ucsv.reader(handle):
            index += 1
            logging.info('Reading row %d' % index)
            if len(fields) != 3:
                message = "Wrong number of fields in row {0}: {1}".format(
                    index + 1, fields)
                raise Exception(message)
            yield tuple(fields[:3])
Example #5
0
def guess_dialect(filename):
    """Try to guess the csv dialect of *filename*.

    The first row of the file is parsed once with every dialect returned
    by ``csv.list_dialects()``; the dialect yielding the most columns wins
    (the earliest one on ties).

    :param filename: path of the CSV file to inspect.
    :returns: name of the best dialect, or ``''`` when no dialect produced
        any column (for example an empty file).
    """
    best = ''
    max_columns = 0
    for dialect in csv.list_dialects():
        # `with` closes the file even if parsing the header raises.
        with open(filename, 'r') as handle:
            rd = ucsv.reader(handle, dialect=dialect)
            # Builtin next() with a default replaces rd.next(): an empty
            # file counts as a zero-column header instead of raising
            # StopIteration.
            header = next(rd, [])
        if len(header) > max_columns:
            max_columns = len(header)
            best = dialect
    return best
Example #6
0
#coding: utf-8
import sys
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
import time, Tools, CTMConst
import ucsv as csv
from selenium.webdriver.common import keys
# Read every study row up front and close the file.  The original
# `f1.close` was missing its call parentheses, so it did nothing and the
# handle stayed open; the rows are materialised first because the reader
# can only be consumed while the file is open.
with open('studies.csv', 'rb') as f1:
    Studies = list(csv.reader(f1))
driver = webdriver.Ie()
driver.implicitly_wait(5)
Tools.WinAuthLogin(CTMConst.CTMUrl, driver)
#------------
# NOTE(review): user name and password are hard-coded in source; they
# should come from configuration or the environment.
Field = driver.find_element_by_xpath("//input[@type='text'][@id='LoginPart__ctl5_txtUserName']")
Field.send_keys("Viktor.klymenko")
Field = driver.find_element_by_xpath("//input[@type='password']")
Field.send_keys("Suicide1`")

# Press the "Sign In" submit button if it can be located.
htmlElement = Tools.FindInFramesXPRec("//input[@value='Sign In'][@type='submit']", driver)
if htmlElement is not None:
    htmlElement.click()
else:
    print("Error - Button Sign In is not found.")

#----------
for StudyNo in Studies:
    try:
        Tools.NavigateToScreen(CTMConst.StudyList, driver)
        inputElement = Tools.FindInFramesXPRec("//a[starts-with(text(), '3')]", driver)
Example #7
0
        if row[1] not in meta_csv:
            meta_csv.append(row[1])
'''

#meta_query = db.query("SELECT * FROM Meta")
# Turn every Meta row into an {'id', 'name'} record built from its first
# two columns and add it to `meta`.
meta_results = cursor.execute("SELECT * FROM Meta")
meta.extend(
    {'id': int(meta_result[0]), 'name': meta_result[1]}
    for meta_result in cursor.fetchall()
)

# Same treatment for the Genre table, collected into `genre`.
genre_results = cursor.execute("SELECT * FROM Genre")
genre.extend(
    {'id': int(genre_result[0]), 'name': genre_result[1]}
    for genre_result in cursor.fetchall()
)


# Walk bdm.csv row by row and resolve the genre id for each row.
# NOTE(review): codecs.open is called without an encoding, so no decoding
# is requested here.
with codecs.open('bdm.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        # Skip rows whose first column is the literal text 'Style musical'
        # (presumably the header row -- confirm against the data file).
        if row[0] == 'Style musical':
            continue
        # genre_id stays False when no entry of `genre` has a name equal
        # to the first column; the scan does not break, so the last
        # matching entry wins.
        genre_id = False
        for g in genre:
           if row[0] == g['name']:
              genre_id =  g['id']

'''
        #Add synonymous
        for row_syn in row[4].split(','):
           row_syn = row_syn.strip('?').strip()
           for g in genre:
              if row_syn.lower() == g['name'].lower():
                 syn_id =  g['id']
Example #8
0
def main():
    """Convert an XLS price list into rows for a ShopOS CSV import file.

    Command-line arguments:
        ARG  input XLS file
        -d   include descriptions in the output rows
        -c   compare prices against ``export-out.csv``
        -e   download ``export.csv`` first (via ``download_export``)
        -b   extract brands from product names
        -f   split items into a second ``-add.csv`` table carrying
             descriptions; also forces ``-c``

    In the code visible here the rows are accumulated in ``table_output``
    (and ``table_output_add`` with ``-f``); the csv writers are created
    but nothing is written through them and the opened files are not
    closed in this portion.
    """
    # set up the command-line argument parser
    parser = argparse.ArgumentParser(description='Convert price to ShopOS CSV import file.')

    # input price-list file (the original comment said XML; the help text
    # and the code below treat it as XLS)
    parser.add_argument('arguments', metavar='ARG', type=str, nargs=1, help='input XLS file')

    # include descriptions in the output CSV
    parser.add_argument('-d', dest='descr', action='store_true', help='enable description')

    # compare the price list against export.csv
    parser.add_argument('-c', dest='compare', action='store_true', help='compare prices')

    # download export.csv from the server
    parser.add_argument('-e', dest='download', action='store_true', help='download export.csv')

    # extract brands
    parser.add_argument('-b', dest='brands', action='store_true', help='filter brands')

    # create an additional -add.csv file that includes descriptions
    # (used for the equipment price list)
    parser.add_argument('-f', dest='descfilter', action='store_true', help='filter descriptions (for tools)')
    args = parser.parse_args()

    # if -e was given
    if args.download:
        # download export.csv from the site
        download_export()

    # xls_input is the input XLS price list
    xls_input = args.arguments[0]

    debug_print('Reading XLS ' + xls_input)
    
    # load the data from the input price list into table_input
    # table_input[category] = [int(article no.), unicode(name), unicode(description), int(wholesale), int(retail), str(plus marks)]
    table_input = xlsread(xls_input)

    # name of the output CSV file: the last three characters of the input
    # name are replaced with 'csv'
    csv_output_fname = args.arguments[0][:-3] + 'csv'

    debug_print('Creating CSV ' + csv_output_fname)

    # create the CSV file, delimiter "|"
    # NOTE(review): this file is not closed anywhere in the visible code.
    file_csv_output = open(csv_output_fname, 'wb')
    csv_output = csv.writer(file_csv_output, delimiter='|')

    # if equipment filtering was requested, create a second CSV
    if args.descfilter:
        csv_output_fname_add = args.arguments[0][:-4] + '-add.csv'
        file_csv_output_add = open(csv_output_fname_add, 'wb')

        debug_print('Creating CSV ' + csv_output_fname_add)
        csv_output_add = csv.writer(file_csv_output_add, delimiter='|')

        # and mark that it has to be compared as well
        args.compare = True

    # if comparison is needed,
    if args.compare:
        # open the previously prepared export-out.csv
        # (export data from the site)
        # NOTE(review): the file object opened inline is never closed explicitly.
        csv_compare = csv.reader(open('export-out.csv', 'rb'), delimiter='|')

        # table holding all the product data from the site that the comparison needs
        # table_compare[int(article no.)] = [int(wholesale), int(retail), reduced-price indicator, description]
        table_compare = {}
        for i,row in enumerate(csv_compare):
            # skip the first row and rows whose sliced column 11 is empty
            # or whose column 8 is zero
            if (i != 0) and (row[11][2:-5] != '') and (int(row[8]) != 0):
                table_compare[int(row[1])] = [int(row[11][2:-5]), int(row[8]), row[15], row[29]]
                                            # wholesale,          retail,      ean,     descr

    # declare the output table
    table_output = []

    # set up the header rows of the CSV tables
    if args.descfilter:
        table_output_add = []
        table_output_add.append(['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                  'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1',
                  'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight',
                  'p_ean', 'p_disc', 'p_opttpl', 'p_vpe', 'p_vpe_status', 'p_vpe_value',
                  'p_name.ru', 'p_desc.ru', 'p_shortdesc.ru', 'p_meta_title.ru', 'p_meta_desc.ru',
                  'p_meta_key.ru', 'p_url.ru', 'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3',
                  'p_cat.4', 'p_cat.5'])

    # with -d the header carries the two description columns
    # (p_desc.ru, p_shortdesc.ru); without it they are left out
    if args.descr:
        header = ['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                  'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1',
                  'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight',
                  'p_ean', 'p_disc', 'p_opttpl', 'p_vpe', 'p_vpe_status', 'p_vpe_value',
                  'p_name.ru', 'p_desc.ru', 'p_shortdesc.ru', 'p_meta_title.ru', 'p_meta_desc.ru',
                  'p_meta_key.ru', 'p_url.ru', 'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3',
                  'p_cat.4', 'p_cat.5']
    else:
        header = ['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                  'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1', 'p_priceNoTax.2',
                  'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight', 'p_ean', 'p_disc', 'p_opttpl', 'p_vpe',
                  'p_vpe_status', 'p_vpe_value', 'p_name.ru', 'p_meta_title.ru', 'p_meta_desc.ru', 'p_meta_key.ru',
                  'p_url.ru', 'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3', 'p_cat.4', 'p_cat.5']
    table_output.append(header)

    debug_print('Starting parsing catalog')

    # start parsing the price list one item at a time
    # category is the price-list category the item is taken from
    for category in table_input:
        # item is the item itself with all of its data
        for item in table_input[category]:
            # DEFAULTS {{
            p_status = '1' # product status - always enabled

            # the remaining defaults do not matter
            p_sorting = '0' # sorting
            p_shipping = '0' # shipping
            p_tpl = p_opttpl = 'default' # templates
            p_fsk18 = p_tax = p_vpe = p_vpe_status = '0'
            p_priceNoTax_1 = p_priceNoTax_2 = p_url = ''
            p_weight = '0.00'
            p_disc = p_vpe_value = '0.0000'
            # }} //DEFAULTS

            p_model = str(item[0]) # article number
            p_stock = config.QTY[item[5]] # availability
            p_name = item[1].replace('/', '/ ') # name

            if args.brands:
                [p_manufacturer, pricebrand] = brand(p_name, p_model) # manufacturer
            else:
                [p_manufacturer, pricebrand] = ['', '']

            # short description
            p_shortdesc = short_desc(category, item[2])
            # template-based SEO description - still in development
            
            # NOTE(review): brand() is called again here regardless of -b,
            # instead of reusing `pricebrand` from above.
            seo_description = seo_desc(p_name, category, brand(p_name, p_model)[1])

            # NOTE(review): when seo_description equals config.LEAVE_OLD_DESC
            # and -c is off, p_desc is not assigned in this iteration; the
            # rows below that include p_desc then use a stale value or fail
            # with a NameError on the first item.
            if seo_description == config.LEAVE_OLD_DESC:
                if args.compare:
                    try:
                        p_desc = table_compare[int(p_model)][3]
                    # NOTE(review): bare except; the handler indexes the
                    # hard-coded key 11928, which looks like leftover
                    # debugging and can itself raise KeyError. p_desc stays
                    # unassigned on this path.
                    except:
                        debug_print_pause(table_compare[11928])

                else:
                    pass
            else:
                if p_shortdesc != '':
                    p_desc = '<p>' + p_shortdesc + '</p><br/>' + seo_description # description with extras
                else:
                    p_desc = seo_description # description with extras
                #p_desc = p_shortdesc
        

            # instead, use this simple construct
            #p_desc = ''
            #if short_desc != '':
            #    p_desc = '<p>' + p_shortdesc + '</p>'

            # filter by presence of the product/description for equipment products
            if args.descfilter:
                # by default, add the description from the price list to the product
                is_add = True

                try:
                    # but if the site already has a description, or the price list has none,
                    if (table_compare[int(p_model)][3] != '') or (p_shortdesc == ''):
                        # then insert nothing
                        is_add = False
                # an article missing from table_compare keeps is_add True
                # NOTE(review): `except KeyError, e` is Python 2-only syntax.
                except KeyError, e:
                    pass

            # wholesale price
            p_priceNoTax_3 = str(int(math.ceil(item[3])))
            # retail price
            p_priceNoTax = str(int(math.ceil(item[4])))

            # categorize
            p_cat = getcat(category, p_name)

            # meta tags; a non-empty temp_name replaces the product name
            [temp_name, p_meta_title, p_meta_desc, p_meta_key] = metatags(p_name, p_shortdesc, p_cat, p_model)
            if temp_name != '':
                p_name = temp_name

            # reduced price
            p_ean = ''
            if args.compare:
                p_ean = ean(p_model, p_priceNoTax_3, p_priceNoTax, table_compare)

            # merge all collected data into one row of the table
            # and dump it into the csv
            if args.descfilter:
                # if this is an equipment price list that has to be split in two,
                # then split it
                if is_add:
                    table_output_add.append(['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl, p_manufacturer, p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1, p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value, p_name, p_desc, p_shortdesc, p_meta_title, p_meta_desc, p_meta_key, p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]])
                else:
                    table_output.append(['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl, p_manufacturer, p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1, p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value, p_name, p_meta_title, p_meta_desc, p_meta_key, p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]])
            else:
                # all the other cases go here
                if args.descr:
                    row = ['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl, p_manufacturer, p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1, p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value, p_name, p_desc, p_shortdesc, p_meta_title, p_meta_desc, p_meta_key, p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]]
                else:
                    row = ['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl, p_manufacturer, p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1, p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value, p_name, p_meta_title, p_meta_desc, p_meta_key, p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]]

                table_output.append(row)
Example #9
0
#coding: utf-8
import sys
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
import time, Tools, CTMConst
import ucsv as csv
from selenium.webdriver.common import keys
# NOTE(review): f1 is not closed anywhere in the visible code.  The reader
# is kept lazy (rather than read eagerly and closed) because its consumer
# is not visible here -- confirm, then close the file once Payees has been
# fully read.
f1 = open('payees.csv', 'rb')
Payees = csv.reader(f1)
FailedPayees = []
driver = webdriver.Ie()
driver.implicitly_wait(5)
Tools.WinAuthLogin(CTMConst.CTMUrl, driver)
#------------
# NOTE(review): user name and password are hard-coded in source; they
# should come from configuration or the environment.
Field = driver.find_element_by_xpath("//input[@type='text'][@id='LoginPart__ctl5_txtUserName']")
Field.send_keys("bioctest1")
Field = driver.find_element_by_xpath("//input[@type='password']")
Field.send_keys("Suicide1`")

# Press the "Sign In" submit button if it can be located.
htmlElement = Tools.FindInFramesXP("//input[@value='Sign In'][@type='submit']", driver)
if htmlElement is not None:
    htmlElement.click()
else:
    print("Error - Button Sign In is not found.")

try:
        driver.switch_to_alert().accept()
        time.sleep(4)
        #driver.get("http://ts-host-3/ctpm/StudySetup/StudyList.aspx")
except(WebDriverException):
Example #10
0
#
# You should have received a copy of the GNU General Public License
# along with MT/I.  If not, see <http://www.gnu.org/licenses/>.

# Please only use these spiders and their derivatives in accordance
# with the terms of service and acceptable use policies of the data
# providers.


import csv, gzip, ucsv, urllib2, time
from StringIO import StringIO

# NOTE(review): `max` shadows the builtin max(); it is not read in the
# visible part of the script.
max = 0

# NOTE(review): `input` shadows the builtin of the same name, and the file
# is not closed in the visible part of the script.
input = open('./country_links.csv', 'r')
# Rows are parsed with the tab-delimited csv.excel_tab dialect.
reader = ucsv.reader(input, csv.excel_tab)

# Number of rows processed so far.
count = 0

for row in reader:
	count += 1

	# The second column is appended to the English Wikipedia host to form
	# the page URL (presumably a site-relative path -- confirm).
	url = 'http://en.wikipedia.org' + row[1]
	print url

	# Send the request with a custom User-Agent header.
	request = urllib2.Request(
	 	url, headers = {'User-Agent': 'manytopics/international'}
	)

	response = urllib2.urlopen(request)
Example #11
0
# along with MT/I.  If not, see <http://www.gnu.org/licenses/>.

# Please only use these spiders and their derivatives in accordance
# with the terms of service and acceptable use policies of the data
# providers.


import csv, json, ucsv
import lxml.etree

countries = []

# NOTE(review): neither file is closed in the visible part of the script.
links = open('./country_links.csv', 'r')
output = open('./countries.json', 'w')

# Rows are parsed with the tab-delimited csv.excel_tab dialect.
reader = ucsv.reader(links, csv.excel_tab)

for row in reader:
	error = False

	# The first column is stored as the country's name.
	country = {}
	country['name'] = row[0]

	# The local file name is the second column minus its first six
	# characters (presumably a '/wiki/' prefix -- confirm against the data).
	file_name = 'countries/' + row[1][6:] + '.html'
	# NOTE(review): the file object opened here is never closed explicitly.
	data = open(file_name).read()

	# The saved page is parsed as XML, not with an HTML parser.
	tree = lxml.etree.XML(data)

	# From the header link titled 'Demonym', go up to the table row and
	# select the links inside its data cell.
	xpath = "//table/tr/th/a[@title='Demonym']/../../td/a"

	try: