Ejemplo n.º 1
0
def process(filt):
    filt = int(filt)
    f_okato = open("res/okato_codes.csv", 'rb')
    csvreader = csv.DictReader(f_okato)
    for row in csvreader:
        if filt == row['OKATO1'] or filt == row['OKATO2'] or filt == row[
                'OKATO3'] or filt == 0:

            if str(row['OKATO3']) != '':
                final = "http://112.ru/publish/00/00/nearOrg/mvd/" + str(
                    row['OKATO1']) + "/f" + str(row['OKATO2']) + "/" + str(
                        row['OKATO3'])
            else:
                final = "http://112.ru/publish/00/00/nearOrg/mvd/" + str(
                    row['OKATO1']) + "/f" + str(row['OKATO2'])
            try:
                res = urllib2.urlopen(final + ".shtml")
                parse_org(final, str(row['OKATO2']))
            except urllib2.URLError, e:
                #import pdb;pdb.set_trace()
                get_photo_status = False
                if hasattr(e, 'reason'):
                    print 'We failed to reach a server.'
                    print 'Reason: ', e.reason
                elif hasattr(e, 'code'):
                    print 'The server couldn\'t fulfill the request.'
                    print 'Error code: ' + str(
                        e.code) + " Page: " + final + ".shtml"
Ejemplo n.º 2
0
    def handle(self, *args, **options):
        file_name = args[0]
        f = open(file_name, 'rb')
        d = csv.DictReader(f)
        for row in d:
            username = row['username']

            if User.objects.filter(username=username).exists():
                print 'User %s exists.' % (username)
            else:
                first_name = row.get('first_name', '')
                last_name = row.get('last_name', '')
                email = row.get('email', '')
                locality = row.get('locality', '')
                gender = row.get('gender', '')
                password = row.get('password', '')

                user = User(
                    username=username,
                    email=email,
                    first_name=first_name,
                    last_name=last_name,
                )

                user.set_password(password)
                user.save()

                user.profile.gender = gender
                try:
                    user.profile.locality = Entity.objects.get(id=locality)
                except ObjectDoesNotExist:
                    print 'user %s locality id %s does not exist' % (username,
                                                                     locality)
                user.profile.save()
Ejemplo n.º 3
0
def GetOrgnrAndKommunIDForGroup(groupname):
	"""Look up the kommun id and organisation number for *groupname*.

	Performs a case-insensitive match against the 'namn' column of
	data/kommunid.csv and returns (id, orgnr) as strings, or ("", "")
	when no row matches.
	"""
	wanted = groupname.lower()
	with open('data/kommunid.csv', 'rb') as csvfile:
		rows = ucsv.DictReader(csvfile, delimiter=',', quoting=ucsv.QUOTE_ALL, fieldnames=['namn', 'id', 'orgnr'])
		for entry in rows:
			# stored names may be wrapped in literal double quotes
			candidate = entry['namn'].lower().strip("\"")
			if candidate == wanted:
				return str(entry['id']), str(entry['orgnr'])
	return "", ""
Ejemplo n.º 4
0
    def handle(self, *args, **options):
        """Invite users listed in a CSV file (path passed as first arg).

        Each row provides a name and an email (Hebrew column headers).
        Already-active users are skipped; everyone else gets an
        invitation email with an activation link.
        """
        file_name = args[0]
        site = Site.objects.get(pk=settings.SITE_ID)
        # Close the CSV file deterministically (the original leaked it).
        with open(file_name, 'rb') as f:
            for row in csv.DictReader(f):
                names = row[u'שם'].split(' ')
                email = row[u'דואר אלקטרוני']
                user = invite_user(
                    username=email,
                    email=email,
                    first_name=names[0],
                    last_name=' '.join(names[1:]),
                    site=site,
                )
                if user.is_active:
                    self.stdout.write('%s is already active, no invitation sent' %
                                      email)
                    continue

                # send an invitation email
                reg_profile = user.registrationprofile_set.all()[0]
                ctx_dict = {
                    'invitation_key': reg_profile.activation_key,
                    'expiration_days': settings.ACCOUNT_ACTIVATION_DAYS,
                    'site': site
                }
                # Email subject *must not* contain newlines
                subject = render_to_string('user/invitation_email_subject.txt',
                                           ctx_dict).rstrip()
                html_content = render_to_string('user/invitation_email.html',
                                                ctx_dict)
                text_content = '\n'.join(
                    (strip_tags(html_content), "http://%s%s" %
                     (site.domain,
                      reverse("accept-invitation",
                              args=(ctx_dict['invitation_key'], )))))

                # create the email, and attach the HTML version as well.
                msg = EmailMultiAlternatives(subject, text_content,
                                             settings.DEFAULT_FROM_EMAIL,
                                             [email])
                msg.attach_alternative(html_content, "text/html")
                msg.send()
Ejemplo n.º 5
0
def main(argv):
    (inputfile, outputfile) = parse_input(argv)

    phrases = []
    cleaned_file = ''
    try:
        cleaned_file = sanitize_file(inputfile)
        line_number = 1
        # Here we don't need the codecs.open as we use ucsv to read the file
        with open(cleaned_file, 'rb') as csvfile:
            for row in ucsv.DictReader(csvfile):
                line_number += 1
                pinyin_phrase = row['Pronunciation']
                try:
                    annotated_pinyin = annotate_phrase(pinyin_phrase)
                except ValueError:
                    print "There's a fishy pronunciation entry on line %d." % line_number
                    continue
                sort_value = calc_sort_value(annotated_pinyin[::-1], 1, 0)
                (first_syllable, _tone) = annotated_pinyin[0]

                hanzi_phrase = row['Word']
                phrases.append((sort_value, first_syllable,
                                hanzi_phrase, pinyin_phrase))
        if cleaned_file.endswith('sanitized.csv'):
            os.remove(cleaned_file)
    except IOError:
        if cleaned_file.endswith('sanitized.csv'):
            os.remove(cleaned_file)
        print 'Bad input file: ', inputfile 

    sorted_phrases = sorted(phrases, key = itemgetter(0, 1))

    output_ready_phrases = [phrase[2:4] for phrase in sorted_phrases]
    
    with open(outputfile, 'wb') as f:
        writer = ucsv.writer(f)
        writer.writerow(['Word', 'Pronunciation'])
        writer.writerows(output_ready_phrases)
Ejemplo n.º 6
0
def load(secure,hostname,url,schema,table,verbose):
  show("begin "+hostname+" "+url+" "+schema+" "+table)
  if secure:
    address = "https://"+hostname+url
  else:
    address = "http://"+hostname+url
  
  #""" load from web
  show("load from "+address)
  try:
    response = requests.get(address)
  except e:
    show('HTTP GET failed.')
    show('Reason: %s'%(e.reason))
    sys.exit(2)
  else:
    # everything is fine
    show("api call OK")
  
  # read the data.
  # all of it. this is dangerous for big datasets!
  # convert to utf-8 on-the-fly if it's not
  data = response.text.encode('utf-8')
  #"""

  # create temporary file (remove at the end)
  f = tempfile.NamedTemporaryFile() #defaults: mode='w+b', delete=True)
  show("using tempfile: %s"%(f.name))
  f.write(data)
  
  # start using data, go to start
  f.seek(0)
  # remove BOM if exists
  if f.read(3)!=codecs.BOM_UTF8: f.seek(0)
  
  # make csv dictionary (first row must have column names)
  csvdata = csv.DictReader(f, delimiter=";")
  
  # discover columns and their types (read through entirely!)
  show("discover table structure")
  cnt=0
  for row in csvdata:
    cnt+=1
    if verbose: print cnt,row
    if verbose:
      for col in row:
        print cnt,col,row[col]
    dboperator.columns(row)
  
  # start operating with database
  # drop table
  show("drop %s.%s"%(schema,table))
  dboperator.drop(schema,table)

  # create table
  show("create %s.%s"%(schema,table))
  dboperator.create(schema,table)

  show("insert data")
  # reset csvdata!
  f.seek(0)
  # remove BOM
  if f.read(3)!=codecs.BOM_UTF8: f.seek(0)
  csvdata = csv.DictReader(f, delimiter=";")
  
  cnt=0
  for row in csvdata:
    cnt+=1
    # show some sign of being alive
    if cnt%100 == 0:
      sys.stdout.write('.')
      sys.stdout.flush()
    if cnt%1000 == 0:
      show("-- %d" % (cnt))
    dboperator.insert(address,schema,table,row)

  show("wrote %d"%(cnt))
  dboperator.close()
  
  # close (and delete) file
  f.close()

  show("ready")
Ejemplo n.º 7
0
class ParseXML:
    """Tag-production helper driven by three lookup CSV files.

    NOTE(review): the CSV files below are read at class-definition
    (import) time and the file handles are never closed; importing this
    module fails if the files are missing from the working directory.
    """
    ##########################READ CSV###################
    #Read CSV file containing the right tags to produce
    dictReader = csv.DictReader(open('awol_title_strings.csv', 'rb'),
                                fieldnames=['titles', 'tags'],
                                delimiter=',',
                                quotechar='"')
    #Build a dictionary from the CSV file-> {<string>:<tags to produce>}
    titleStringsDict = dict()
    for row in dictReader:
        titleStringsDict.update({row['titles']: row['tags']})

    #Read awol_colon_prefixes.csv file and build a dictionary
    dictReader2 = csv.DictReader(
        open('awol_colon_prefixes.csv', 'rb'),
        fieldnames=['col_pre', 'omit_post', 'strip_title', 'mul_res'],
        delimiter=',',
        quotechar='"')
    colPrefDict = dict()
    #Build a dictionary of format {<column prefix>:<list of cols 2,3 and 4>}
    for row in dictReader2:
        colPrefDict.update({
            row['col_pre']:
            [row['omit_post'], row['strip_title'], row['mul_res']]
        })

    #Read content-disposition.csv file and build a dictionary
    dictReader3 = csv.DictReader(open('content-disposition.csv', 'rb'),
                                 fieldnames=[
                                     'title', 'title_normalized', 'colonfix',
                                     'single_resource', 'ignore', 'checked',
                                     'multiple_resource', 'url', 'id'
                                 ],
                                 delimiter=',',
                                 quotechar='"')
    contDispDict = dict()
    #Build a dictionary of format {<id>:[<list of rest of the columns>]}
    # Only rows flagged single_resource == 'true' are kept.
    for row in dictReader3:
        if row['single_resource'] == 'true':
            contDispDict.update({
                row['id']: [
                    row['title'], row['title_normalized'], row['colonfix'],
                    row['single_resource'], row['ignore'], row['checked'],
                    row['multiple_resource'], row['url']
                ]
            })

    #############END OF READ CSV#########################
    #Expand a possibly comma-separated tag value into tag entries
    def checkMulTags(self, tag, tags):
        """Append *tag* to *tags* in place; a comma-separated *tag*
        contributes one {'tag': ...} entry per component."""
        parts = tag.split(',') if ',' in tag else [tag]
        for part in parts:
            tags.append({'tag': part})

    #Function to get ISSNs if any from the given XML
    def getISSNFromXML(self, root):
        """Return the first ISSN found in the serialized XML, preferring
        an electronic/online ISSN when several are present, else None.
        """
        xmlStr = exml.tostring(root, encoding='utf8', method='xml')
        issnrex = re.findall(r'issn[^\d]*[\dX]{4}-?[\dX]{4}', xmlStr,
                             re.IGNORECASE)
        if not issnrex:
            return None
        log.debug('Found ISSNs')
        if len(issnrex) > 1:  #If more than 1 issns are found
            # Prefer the electronic edition. The original wrote
            # ('electrón' or 'électron' or ...) in s, which only ever
            # tested the first literal, 'electrón'.
            markers = ('electrón', 'électron', 'electron', 'digital',
                       'online')
            for s in issnrex:
                if any(marker in s for marker in markers):
                    issn = re.search(r'[\dX]{4}-?[\dX]{4}', s)
                    log.debug(issn.group())
                    return issn.group()
        issn = re.search(r'[\dX]{4}-?[\dX]{4}', issnrex[0], re.IGNORECASE)
        log.debug(issn.group())
        return issn.group()

    #Function to look up data in CSV converted dict and produce relevant tags
    def produceTag(self, tags, categories, title):
        for c in categories:
            tag = c.attrib['term']
            if (tag != '' or tag != None) and ('kind#post' not in tag.lower()):
                if tag in self.titleStringsDict.keys():
                    tag = self.titleStringsDict[tag]
                else:
                    tag = self.caseConversion(tag)
                #Check if multiple tags separated by ',' exist in the titleStringsDict[tag]
                self.checkMulTags(tag, tags)
                print tags

        for key in self.titleStringsDict.keys():
            try:
                if title != None and key in title.lower():
                    tag = self.titleStringsDict[key]
                    if tag != '':
                        self.checkMulTags(tag, tags)
            except Exception, e:
                pass
#                 log.info("Problem with key:%s" % key)

        if title != None and "open" and (
                "access" or "accesss") and not "partially" in title:
            tags.append({u'tag': "Open Access"})
        elif title != None and "open" and ("access" or
                                           "accesss") and "partially" in title:
            tags.append({u'tag': "Mixed Access"})
        elif title != None and "series" and not "lecture" in title:
            tags.append({u'tag': "Series"})
        print tags
        return tags