def fill_countries(file_path: Union[Path, str]) -> List[Country]:
    """Build Country objects from the rows of a .csv file.

    Args:
        file_path (Union[Path, str]): The path to the .csv file to read

    Returns:
        List[Country]: One Country per row, in the order the rows are
            produced by ``get_row``.
    """
    countries: List[Country] = []
    for record in get_row(file_path):
        # The return value of nullify() is ignored, so it presumably
        # normalizes the row in place -- TODO confirm against its definition.
        nullify(record)
        country = Country(**record)
        countries.append(country)
    return countries
def country_process() -> List[Country]:
    """Build the Country objects that are to be stored in the database.

    The rows are read from ``dbc.COUNTRIES_FILE``.

    Returns:
        List[Country]: One Country per row of the countries file.
    """
    result: List[Country] = []
    source_rows = get_row(dbc.COUNTRIES_FILE)
    for entry in source_rows:
        # nullify() is called only for its effect on the row (its return
        # value is discarded) before the row is unpacked as keyword arguments.
        nullify(entry)
        result.append(Country(**entry))
    return result
def institution_process(countries: Dict[str, Country]) -> List[Institution]:
    """Build Institution objects from the GRID database .csv tables.

    Every row of ``institutes.csv`` becomes one Institution. The related
    tables (addresses, acronyms, aliases, labels, links, types) are looked up
    by ``grid_id`` and attached to the institution. A ``soup`` string made of
    the institution name, country, acronyms, aliases and labels is stored on
    each institution for fuzzy matching.

    Args:
        countries (Dict[str, Country]): Country objects keyed by the country
            name as returned by ``dbc.country_name_mapper``

    Returns:
        List[Institution]: One Institution per row of ``institutes.csv``.

    Raises:
        KeyError: If the mapped country name of an address is not a key of
            ``countries``.
    """
    institutes_file = dbc.GRID_DATABASE_DIR / "institutes.csv"
    rows = get_row(institutes_file)
    row_count = csv_size(institutes_file)

    attrs = ["addresses", "acronyms", "aliases", "labels", "links", "types"]
    # Group the GRID data tables by grid_id for better access.
    institution_attrs = [
        get_csv(dbc.GRID_DATABASE_DIR / f"{attr}.csv", "grid_id")
        for attr in attrs
    ]

    institutions_list: List[Institution] = []
    # The context manager closes the progress bar even if a row raises.
    with tqdm(total=row_count) as pbar:
        for row in rows:
            nullify(row)
            # Get all the data related to the current institution.
            # (`inst_types` rather than `type`, to avoid shadowing the builtin.)
            address, acronym, alias, label, link, inst_types = [
                attr.get(row["grid_id"]) for attr in institution_attrs
            ]

            # Create 'soup' variable for fuzzy matching of institutions.
            soup = [row["name"]]
            if address:
                # Only the first address is used. "country" is popped so that
                # it is not passed to Institution() as a plain column value;
                # the Country object is assigned right after instead.
                country = dbc.country_name_mapper(address[0].pop("country"))
                institution = Institution(**{**row, **address[0]})
                institution.country = countries[country]
                soup.append(country)
            else:
                institution = Institution(**row)

            if acronym:
                institution.acronyms = [Acronym(**i) for i in acronym]
                soup.extend(i["acronym"] for i in acronym)
            if alias:
                institution.aliases = [Alias(**i) for i in alias]
                soup.extend(i["alias"] for i in alias)
            if label:
                institution.labels = [Label(**i) for i in label]
                soup.extend(i["label"] for i in label)
            if link:
                institution.links = [Link(**i) for i in link]
            if inst_types:
                institution.types = [Type(**i) for i in inst_types]

            institution.soup = " | ".join(soup)
            institutions_list.append(institution)
            pbar.update()

    # No explicit `del institution_attrs` is needed: the grouped tables are
    # local to this function and are released when it returns.
    return institutions_list
def from_portfolio(cls, portfolio_row):
    u"""Fetch or create the Category named in a portfolio row.

    The category cell holds the English and the Hebrew name together. It is
    split on single spaces and every word is routed to the Hebrew or the
    English name depending on ``has_hebrew_chars``. The lookup is done on
    ``name_he``; ``name_en`` is only passed through ``defaults``.

    >>> cat = Category.from_portfolio({u'קטגוריה': u'Art and Design אמנות ועיצוב'})
    >>> cat.name_he == u'אמנות ועיצוב' and cat.name_en == 'Art and Design'
    True
    """
    from utils import has_hebrew_chars

    # True -> Hebrew words, False -> everything else; word order is kept.
    words_by_script = {True: [], False: []}
    for token in portfolio_row[u'קטגוריה'].split(' '):
        words_by_script[bool(has_hebrew_chars(token))].append(token)

    hebrew_name = nullify(' '.join(words_by_script[True]))
    english_name = nullify(' '.join(words_by_script[False]))
    category, _created = Category.objects.get_or_create(
        name_he=hebrew_name,
        defaults={'name_en': english_name},
    )
    return category
def ranking_process(
    db: Session, file_path: Union[Path, str], soup: Dict[str, Dict[str, str]]
) -> Tuple[List[Institution], List[Dict[str, str]], List[Dict[str, str]]]:
    """Matches institutions with their GRID database counterparts.

    The function reads a .csv file row-by-row and, for each row, tries to
    match the institution to a GRID ID using, in this order: the institution
    URL, the manual matches in ``dbc.MATCHES``, the lower-cased institution
    name within its country, and finally fuzzy matching. If a match is
    found, the function assigns the ranking metrics of that row to the
    matched institution object.

    Args:
        db (Session): SQLAlchemy session instance to connect to the DB
        file_path (Union[Path, str]): The path to the ranking .csv file
        soup (Dict[str, Dict[str, str]]): A set of choices for matching
            institutions

    Returns:
        Tuple[List[Institution], List[Dict[str, str]], List[Dict[str, str]]]:
            Three lists: matched institutions, not-matched institutions, and
            institutions that were matched using the fuzzywuzzy library.
    """
    institutions_list: List[Institution] = []
    not_matched_list: List[Dict[str, str]] = []
    fuzz_list: List[Dict[str, str]] = []

    # useful queries:
    q1 = db.query(Institution)
    q2 = q1.join(Institution.country)

    rows = get_row(file_path)
    row_count = csv_size(file_path)

    # The context manager closes the progress bar even if a row raises.
    with tqdm(total=row_count) as pbar:
        for row in rows:
            pbar.update()
            nullify(row)

            link_type = row["Ranking System"]
            inst_name = row["Institution"].strip().lower()
            inst_country = dbc.country_name_mapper(row["Country"])
            inst_url = row["URL"]
            inst_info = {
                "Raw": inst_name,
                "Country": inst_country,
                "URL": inst_url,
                "Ranking System": row["Ranking System"],
                "Ranking Type": row["Ranking Type"],
                "Year": row["Year"],
                "Field": row["Field"],
                "Subject": row["Subject"],
            }

            # checking link with institution links
            inst: Union[Institution, None] = (
                q1.join(Institution.links)
                .filter(Link.link == inst_url, Link.type == link_type)
                .first()
            )

            # checking grid_id in manual matches
            if not inst and inst_country in dbc.MATCHES:
                match = dbc.MATCHES[inst_country].get(inst_name)
                if match:
                    inst = q1.filter(Institution.grid_id == match).first()

            # checking name with institution name
            if not inst:
                inst = q2.filter(
                    func.lower(Institution.name) == inst_name,
                    Country.country == inst_country,
                ).first()

            # fuzzy-matching of strings
            if not inst:
                inst_grid_id = fuzzy_matcher(inst_name, inst_country, soup)
                if inst_grid_id:
                    inst = q1.filter(
                        Institution.grid_id == inst_grid_id).first()
                    # .first() yields None when the query has no row; only
                    # record the fuzzy match if the institution really exists,
                    # otherwise the row falls through to "Not Matched" below.
                    if inst:
                        fuzz_list.append({
                            "Fuzzy": inst.name,
                            "GRID ID": inst_grid_id,
                            **inst_info,
                        })

            # could not match, or was matched before (with another institution)
            if not inst or inst in institutions_list:
                not_matched_list.append({
                    "Problem": "Not Matched" if not inst else f"Double: {inst}",
                    **inst_info,
                })
                continue

            ranking_metrics = metrics_process(row)

            # Add the ranking-system link only if the institution does not
            # already have a link of that type and the row provides a URL.
            inst_link_types = [link.type.name for link in inst.links]
            if (link_type not in inst_link_types) and inst_url:
                inst.links.append(Link(type=link_type, link=inst_url))

            inst.rankings.extend(ranking_metrics)
            institutions_list.append(inst)

    return (institutions_list, not_matched_list, fuzz_list)
def from_portfolio(cls, portfolio_row):
    """Fetch or create the Subject objects listed in a portfolio row.

    The subject cell is split on commas; each piece is stripped of
    surrounding whitespace, passed through ``nullify`` and used as the
    ``name_he`` of a ``get_or_create`` lookup.

    Returns a list with one Subject per comma-separated piece, in cell order.
    """
    subjects = []
    for raw_name in portfolio_row[u'נושא'].split(','):
        subject, _created = Subject.objects.get_or_create(
            name_he=nullify(raw_name.strip()),
        )
        subjects.append(subject)
    return subjects
def from_portfolio(cls, portfolio_row):
    """Fetch or create the Technique described by a portfolio row.

    Both the Hebrew and the English name cells are passed through
    ``nullify`` and used together as the ``get_or_create`` lookup.
    """
    hebrew_name = nullify(portfolio_row[u'טכניקה'])
    english_name = nullify(portfolio_row['Technique'])
    technique, _created = Technique.objects.get_or_create(
        name_he=hebrew_name,
        name_en=english_name,
    )
    return technique
def from_portfolio(cls, portfolio_row):
    """Fetch or create the Client described by a portfolio row.

    Both the Hebrew and the English name cells are passed through
    ``nullify`` and used together as the ``get_or_create`` lookup.
    """
    hebrew_name = nullify(portfolio_row[u'לקוח'])
    english_name = nullify(portfolio_row['Client'])
    client, _created = Client.objects.get_or_create(
        name_he=hebrew_name,
        name_en=english_name,
    )
    return client
def from_portfolio(cls, portfolio_row):
    """Fetch or create the Country described by a portfolio row.

    Both the Hebrew and the English name cells are passed through
    ``nullify`` and used together as the ``get_or_create`` lookup.
    """
    hebrew_name = nullify(portfolio_row[u'ארץ'])
    english_name = nullify(portfolio_row[u'Country'])
    country, _created = Country.objects.get_or_create(
        name_he=hebrew_name,
        name_en=english_name,
    )
    return country